Untitled
unknown
python
a month ago
5.7 kB
13
Indexable
# Reload the necessary libraries and data import pandas as pd from geopy.geocoders import Nominatim from geopy.distance import great_circle import time import requests import logging import concurrent.futures from tqdm import tqdm import openpyxl # Set up logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') logger = logging.getLogger(__name__) print("Loading data from Excel file...") # Reload the file file_path = "Raw Data - Freight Flights 2024.xlsx" xls = pd.ExcelFile(file_path) # Load the data df = pd.read_excel(xls, sheet_name="Sheet1") # Clean column names df.columns = df.columns.str.strip() # Filter for 2024 data df_2024 = df[df["Year"] == 2024].copy() print(f"Loaded {len(df_2024)} freight flights from 2024.") # Initialize geolocator with a more specific user agent geolocator = Nominatim(user_agent="freight_flight_distance_calculator_v1.0") # Function to get coordinates of a city, country with better error handling def get_coordinates(city, country): # Try multiple query formats queries = [ f"{city}, {country}", f"{city} {country}", city # Sometimes just the city works better ] for query in queries: try: # Add a longer timeout and be explicit about exactly_one location = geolocator.geocode(query, timeout=15, exactly_one=True) if location: logger.info(f"Successfully geocoded {city}, {country} using query: {query}") return (city, country, (location.latitude, location.longitude)) time.sleep(1) # Small delay between query attempts except requests.exceptions.RequestException as e: logger.error(f"Request error for {query}: {e}") time.sleep(2) # Slightly longer delay after a request error except Exception as e: logger.error(f"Error geocoding {query}: {e}") time.sleep(2) # If all attempts failed logger.warning(f"Failed to geocode {city}, {country} after trying multiple formats") return (city, country, None) # Create a dictionary to store coordinates city_country_coords = {} # Extract unique city-country pairs for origins and destinations unique_locations = list(set(zip(df_2024["Origin City"], df_2024["Origin Country"])) | set(zip(df_2024["Destination City"], df_2024["Destination Country"]))) print(f"Found {len(unique_locations)} unique locations to geocode.") # Function to be used with ThreadPoolExecutor def geocode_location(location_tuple): city, country = location_tuple result = get_coordinates(city, country) # Add a delay to respect rate limits time.sleep(1) return result # Use ThreadPoolExecutor to parallelize geocoding print("Geocoding locations in parallel...") max_workers = 5 # Adjust based on API rate limits successful_geocodes = 0 with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks and create a dictionary to track them future_to_location = {executor.submit(geocode_location, loc): loc for loc in unique_locations} # Process results as they complete for future in tqdm(concurrent.futures.as_completed(future_to_location), total=len(unique_locations)): try: city, country, coords = future.result() city_country_coords[(city, country)] = coords if coords: successful_geocodes += 1 except Exception as e: logger.error(f"Exception occurred during geocoding: {e}") print(f"Geocoding complete: {successful_geocodes}/{len(unique_locations)} locations successfully geocoded ({successful_geocodes/len(unique_locations)*100:.1f}%).") # Function to calculate distance between origin and destination def calculate_distance(row): origin = city_country_coords.get((row["Origin City"], row["Origin Country"])) destination = city_country_coords.get((row["Destination City"], row["Destination Country"])) if origin and destination: return great_circle(origin, destination).kilometers return None # Apply the function to calculate distances print("Calculating flight distances...") df_2024["Flight Distance (km)"] = df_2024.apply(calculate_distance, axis=1) # Count how many distances were successfully calculated successful_distances = df_2024["Flight Distance (km)"].notna().sum() print(f"Distance calculation complete: {successful_distances}/{len(df_2024)} distances calculated ({successful_distances/len(df_2024)*100:.1f}%).") # Display the updated dataset print("\nAir Freight Data with Calculated Distances:") print(df_2024) # Write the updated data back to the Excel file print("\nWriting updated data back to Excel file...") try: # Load the existing Excel file to preserve other sheets book = openpyxl.load_workbook(file_path) # Create a Pandas Excel writer using the loaded workbook with pd.ExcelWriter(file_path, engine='openpyxl') as writer: writer.book = book writer.sheets = {ws.title: ws for ws in book.worksheets} # Write the updated dataframe to Sheet1, overwriting the existing data df_2024.to_excel(writer, sheet_name="Sheet1", index=False) print(f"Successfully wrote data with flight distances back to '{file_path}'") except Exception as e: logger.error(f"Error writing to Excel file: {e}") # Fallback to CSV if Excel write fails backup_file = "air_freight_data_with_distances.csv" df_2024.to_csv(backup_file, index=False) print(f"Failed to write to Excel. Data saved to CSV file '{backup_file}' instead.")
Editor is loading...
Leave a Comment