Untitled
unknown
python
9 months ago
5.7 kB
22
Indexable
# Reload the necessary libraries and data
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
import time
import requests
import logging
import concurrent.futures
from tqdm import tqdm
import openpyxl
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
print("Loading data from Excel file...")
# Reload the file
file_path = "Raw Data - Freight Flights 2024.xlsx"
xls = pd.ExcelFile(file_path)
# Load the data
df = pd.read_excel(xls, sheet_name="Sheet1")
# Clean column names
df.columns = df.columns.str.strip()
# Filter for 2024 data
df_2024 = df[df["Year"] == 2024].copy()
print(f"Loaded {len(df_2024)} freight flights from 2024.")
# Initialize geolocator with a more specific user agent
geolocator = Nominatim(user_agent="freight_flight_distance_calculator_v1.0")
# Function to get coordinates of a city, country with better error handling
def get_coordinates(city, country):
# Try multiple query formats
queries = [
f"{city}, {country}",
f"{city} {country}",
city # Sometimes just the city works better
]
for query in queries:
try:
# Add a longer timeout and be explicit about exactly_one
location = geolocator.geocode(query, timeout=15, exactly_one=True)
if location:
logger.info(f"Successfully geocoded {city}, {country} using query: {query}")
return (city, country, (location.latitude, location.longitude))
time.sleep(1) # Small delay between query attempts
except requests.exceptions.RequestException as e:
logger.error(f"Request error for {query}: {e}")
time.sleep(2) # Slightly longer delay after a request error
except Exception as e:
logger.error(f"Error geocoding {query}: {e}")
time.sleep(2)
# If all attempts failed
logger.warning(f"Failed to geocode {city}, {country} after trying multiple formats")
return (city, country, None)
# Create a dictionary to store coordinates
city_country_coords = {}
# Extract unique city-country pairs for origins and destinations
unique_locations = list(set(zip(df_2024["Origin City"], df_2024["Origin Country"])) |
set(zip(df_2024["Destination City"], df_2024["Destination Country"])))
print(f"Found {len(unique_locations)} unique locations to geocode.")
# Function to be used with ThreadPoolExecutor
def geocode_location(location_tuple):
city, country = location_tuple
result = get_coordinates(city, country)
# Add a delay to respect rate limits
time.sleep(1)
return result
# Use ThreadPoolExecutor to parallelize geocoding
print("Geocoding locations in parallel...")
max_workers = 5 # Adjust based on API rate limits
successful_geocodes = 0
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks and create a dictionary to track them
future_to_location = {executor.submit(geocode_location, loc): loc for loc in unique_locations}
# Process results as they complete
for future in tqdm(concurrent.futures.as_completed(future_to_location), total=len(unique_locations)):
try:
city, country, coords = future.result()
city_country_coords[(city, country)] = coords
if coords:
successful_geocodes += 1
except Exception as e:
logger.error(f"Exception occurred during geocoding: {e}")
print(f"Geocoding complete: {successful_geocodes}/{len(unique_locations)} locations successfully geocoded ({successful_geocodes/len(unique_locations)*100:.1f}%).")
# Function to calculate distance between origin and destination
def calculate_distance(row):
origin = city_country_coords.get((row["Origin City"], row["Origin Country"]))
destination = city_country_coords.get((row["Destination City"], row["Destination Country"]))
if origin and destination:
return great_circle(origin, destination).kilometers
return None
# Apply the function to calculate distances
print("Calculating flight distances...")
df_2024["Flight Distance (km)"] = df_2024.apply(calculate_distance, axis=1)
# Count how many distances were successfully calculated
successful_distances = df_2024["Flight Distance (km)"].notna().sum()
print(f"Distance calculation complete: {successful_distances}/{len(df_2024)} distances calculated ({successful_distances/len(df_2024)*100:.1f}%).")
# Display the updated dataset
print("\nAir Freight Data with Calculated Distances:")
print(df_2024)
# Write the updated data back to the Excel file
print("\nWriting updated data back to Excel file...")
try:
# Load the existing Excel file to preserve other sheets
book = openpyxl.load_workbook(file_path)
# Create a Pandas Excel writer using the loaded workbook
with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
writer.book = book
writer.sheets = {ws.title: ws for ws in book.worksheets}
# Write the updated dataframe to Sheet1, overwriting the existing data
df_2024.to_excel(writer, sheet_name="Sheet1", index=False)
print(f"Successfully wrote data with flight distances back to '{file_path}'")
except Exception as e:
logger.error(f"Error writing to Excel file: {e}")
# Fallback to CSV if Excel write fails
backup_file = "air_freight_data_with_distances.csv"
df_2024.to_csv(backup_file, index=False)
print(f"Failed to write to Excel. Data saved to CSV file '{backup_file}' instead.")Editor is loading...
Leave a Comment