Untitled

# Reload the necessary libraries and data
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.distance import great_circle
import time
import requests
import logging
import concurrent.futures
from tqdm import tqdm
import openpyxl

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("Loading data from Excel file...")
# Reload the file
file_path = "Raw Data - Freight Flights 2024.xlsx"
xls = pd.ExcelFile(file_path)

# Load the data
df = pd.read_excel(xls, sheet_name="Sheet1")

# Clean column names
df.columns = df.columns.str.strip()

# Filter for 2024 data
df_2024 = df[df["Year"] == 2024].copy()
print(f"Loaded {len(df_2024)} freight flights from 2024.")

# Initialize geolocator with a more specific user agent
geolocator = Nominatim(user_agent="freight_flight_distance_calculator_v1.0")

# Function to get coordinates of a city, country with better error handling
def get_coordinates(city, country):
    # Try multiple query formats
    queries = [
        f"{city}, {country}",
        f"{city} {country}",
        city  # Sometimes just the city works better
    ]
    
    for query in queries:
        try:
            # Add a longer timeout and be explicit about exactly_one
            location = geolocator.geocode(query, timeout=15, exactly_one=True)
            if location:
                logger.info(f"Successfully geocoded {city}, {country} using query: {query}")
                return (city, country, (location.latitude, location.longitude))
            time.sleep(1)  # Small delay between query attempts
        except requests.exceptions.RequestException as e:
            logger.error(f"Request error for {query}: {e}")
            time.sleep(2)  # Slightly longer delay after a request error
        except Exception as e:
            logger.error(f"Error geocoding {query}: {e}")
            time.sleep(2)
    
    # If all attempts failed
    logger.warning(f"Failed to geocode {city}, {country} after trying multiple formats")
    return (city, country, None)

# Create a dictionary to store coordinates
city_country_coords = {}

# Extract unique city-country pairs for origins and destinations
unique_locations = list(set(zip(df_2024["Origin City"], df_2024["Origin Country"])) | 
                       set(zip(df_2024["Destination City"], df_2024["Destination Country"])))
print(f"Found {len(unique_locations)} unique locations to geocode.")

# Function to be used with ThreadPoolExecutor
def geocode_location(location_tuple):
    city, country = location_tuple
    result = get_coordinates(city, country)
    # Add a delay to respect rate limits
    time.sleep(1)
    return result

# Use ThreadPoolExecutor to parallelize geocoding
print("Geocoding locations in parallel...")
max_workers = 5  # Adjust based on API rate limits
successful_geocodes = 0

with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks and create a dictionary to track them
    future_to_location = {executor.submit(geocode_location, loc): loc for loc in unique_locations}
    
    # Process results as they complete
    for future in tqdm(concurrent.futures.as_completed(future_to_location), total=len(unique_locations)):
        try:
            city, country, coords = future.result()
            city_country_coords[(city, country)] = coords
            if coords:
                successful_geocodes += 1
        except Exception as e:
            logger.error(f"Exception occurred during geocoding: {e}")

print(f"Geocoding complete: {successful_geocodes}/{len(unique_locations)} locations successfully geocoded ({successful_geocodes/len(unique_locations)*100:.1f}%).")

# Function to calculate distance between origin and destination
def calculate_distance(row):
    origin = city_country_coords.get((row["Origin City"], row["Origin Country"]))
    destination = city_country_coords.get((row["Destination City"], row["Destination Country"]))
    
    if origin and destination:
        return great_circle(origin, destination).kilometers
    return None

# Apply the function to calculate distances
print("Calculating flight distances...")
df_2024["Flight Distance (km)"] = df_2024.apply(calculate_distance, axis=1)

# Count how many distances were successfully calculated
successful_distances = df_2024["Flight Distance (km)"].notna().sum()
print(f"Distance calculation complete: {successful_distances}/{len(df_2024)} distances calculated ({successful_distances/len(df_2024)*100:.1f}%).")

# Display the updated dataset
print("\nAir Freight Data with Calculated Distances:")
print(df_2024)

# Write the updated data back to the Excel file
print("\nWriting updated data back to Excel file...")
try:
    # Load the existing Excel file to preserve other sheets
    book = openpyxl.load_workbook(file_path)
    
    # Create a Pandas Excel writer using the loaded workbook
    with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
        writer.book = book
        writer.sheets = {ws.title: ws for ws in book.worksheets}
        
        # Write the updated dataframe to Sheet1, overwriting the existing data
        df_2024.to_excel(writer, sheet_name="Sheet1", index=False)
        
    print(f"Successfully wrote data with flight distances back to '{file_path}'")
except Exception as e:
    logger.error(f"Error writing to Excel file: {e}")
    # Fallback to CSV if Excel write fails
    backup_file = "air_freight_data_with_distances.csv"
    df_2024.to_csv(backup_file, index=False)
    print(f"Failed to write to Excel. Data saved to CSV file '{backup_file}' instead.")
Editor is loading...