# Untitled
# unknown
# python
# 6 days ago
# 3.2 kB
# 10
# Indexable
# Never
# Exploratory summary of the Uber rides sample: dataset shape, fare statistics,
# Haversine trip distances and ride counts over selected date ranges.
import numpy as np
import pandas as pd

# CSV file the analysis reads.
DATA_PATH = "uber_rides_data.xlsx - sample_train.csv"

# Earth radius in kilometres used by the Haversine formula below.
EARTH_RADIUS_KM = 6371.0


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are coordinates in degrees. They may be scalars or
    array-likes (e.g. pandas Series); array-likes are processed element-wise
    and the result has the matching shape. NaN inputs yield NaN.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Mathematically 0 <= a <= 1, but floating-point rounding can push it a
    # hair outside that range, which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return EARTH_RADIUS_KM * c


def main(path=DATA_PATH):
    """Load the rides CSV at *path* and print the summary statistics."""
    df = pd.read_csv(path)
    print(df.shape)

    int_column_count = df.select_dtypes(include=['int64']).shape[1]
    print(f'The number of integer columns in the dataset is: {int_column_count}')

    missing_values = df['dropoff_longitude'].isnull().sum()
    print(f'The number of missing values in the "dropoff_longitude" column is: {missing_values}')

    df.info()

    # Series.mean() skips NaN by default, so no explicit dropna is required.
    average_fare = df['fare_amount'].mean()
    print(f"The average fare amount is: {average_fare}")

    # --- Trip distances -------------------------------------------------
    df['haversine_distance'] = haversine(
        df['pickup_latitude'],
        df['pickup_longitude'],
        df['dropoff_latitude'],
        df['dropoff_longitude'],
    )

    median_distance = df['haversine_distance'].median()
    print(f"The median Haversine distance is: {median_distance:.2f} km")

    max_distance = df['haversine_distance'].max()
    print(f"The maximum Haversine distance is: {max_distance:.2f} km")

    # Filter once and reuse for both the count and the mean fare.
    zero_distance_rides = df[df['haversine_distance'] == 0.0]
    zero_distance_count = zero_distance_rides.shape[0]
    print(f"The number of rides with 0.0 Haversine distance is: {zero_distance_count}")

    mean_fare = zero_distance_rides['fare_amount'].mean()
    print(f"The mean fare amount for rides with 0.0 Haversine distance is: {mean_fare}")

    # --- Costliest ride -------------------------------------------------
    max_fare_amount = df['fare_amount'].max()
    print(f"The maximum fare amount for a ride is: {max_fare_amount}")

    max_fare_row = df.loc[df['fare_amount'].idxmax()]
    distance = haversine(
        max_fare_row['pickup_latitude'],
        max_fare_row['pickup_longitude'],
        max_fare_row['dropoff_latitude'],
        max_fare_row['dropoff_longitude'],
    )
    print(f"The Haversine distance for the costliest ride is: {distance:.6f} km")

    # --- Ride counts by date --------------------------------------------
    df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
    pickup = df['pickup_datetime']
    in_2014 = pickup.dt.year == 2014

    num_rides_2014 = df[in_2014].shape[0]
    print(f"Number of rides recorded in the year 2014: {num_rides_2014}")

    first_quarter_2014 = df[in_2014 & pickup.dt.month.isin([1, 2, 3])]
    num_rides_first_quarter_2014 = first_quarter_2014.shape[0]
    print(f"Number of rides recorded in the first quarter of 2014: {num_rides_first_quarter_2014}")

    in_september_2010 = (pickup.dt.year == 2010) & (pickup.dt.month == 9)
    # .copy() yields an independent frame, so adding a column below does not
    # raise SettingWithCopyWarning or depend on view-vs-copy semantics.
    september_2010 = df[in_september_2010].copy()
    september_2010['day_of_week'] = september_2010['pickup_datetime'].dt.day_name()
    rides_per_day = september_2010.groupby('day_of_week').size()

    if rides_per_day.empty:
        # idxmax() raises on an empty Series; report the situation instead.
        print("No rides recorded in September 2010.")
    else:
        max_rides_day = rides_per_day.idxmax()
        max_rides_count = rides_per_day.max()
        print(f"The day with maximum rides recorded in September 2010 was {max_rides_day} with {max_rides_count} rides.")


if __name__ == "__main__":
    main()
# Leave a Comment