Untitled

mail@pastecode.io avatar
unknown
python
6 days ago
3.2 kB
10
Indexable
Never
import pandas as pd
import numpy as np
# Load the Uber rides sample and report its dimensions as (rows, columns).
df = pd.read_csv("uber_rides_data.xlsx - sample_train.csv")
df.head()  # preview of the first rows; the value is discarded in a plain script
df1 = df.shape
print(df1)

# Count the columns whose dtype is exactly int64.
integer_columns = df.select_dtypes(include=['int64'])
num_integer_columns = len(integer_columns.columns)
print(f'The number of integer columns in the dataset is: {num_integer_columns}')


# Count the missing entries in the drop-off longitude column.
missing_values = df['dropoff_longitude'].isna().sum()
print(f'The number of missing values in the "dropoff_longitude" column is: {missing_values}')


# Print the column/dtype summary, then average the fares over the rows
# that actually have a fare recorded.
df.info()
has_fare = df['fare_amount'].notna()
df_cleaned = df[has_fare]
average_fare = df_cleaned['fare_amount'].mean()
print(f"The average fare amount is: {average_fare}")


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    The haversine formula is evaluated on a sphere of radius 6371.0 km.
    The inputs are only combined through NumPy ufuncs and arithmetic, so
    scalars, NumPy arrays and pandas Series are all accepted and the result
    has the matching (broadcast) shape.

    Args:
        lat1: Latitude of the first point, in degrees.
        lon1: Longitude of the first point, in degrees.
        lat2: Latitude of the second point, in degrees.
        lon2: Longitude of the second point, in degrees.

    Returns:
        The distance in kilometres. NaN inputs propagate to NaN outputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` a few ulps above 1 for (near-)antipodal points,
    # which would make arcsin return NaN. Only the upper bound is clamped so
    # that out-of-range coordinates are not silently turned into 0 km.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    earth_radius_km = 6371.0
    return earth_radius_km * c
# Add the pickup -> drop-off great-circle distance (km) for every ride.
df['haversine_distance'] = haversine(df['pickup_latitude'], df['pickup_longitude'],
                                     df['dropoff_latitude'], df['dropoff_longitude'])

median_distance = df['haversine_distance'].median()
print(f"The median Haversine distance is: {median_distance:.2f} km")


max_distance = df['haversine_distance'].max()
print(f"The maximum Haversine distance is: {max_distance:.2f} km")


# Rides whose computed distance is exactly 0.0; the filter is evaluated once
# and reused for both the count and the mean fare.
zero_distance_rides = df[df['haversine_distance'] == 0.0]
zero_distance_count = zero_distance_rides.shape[0]
print(f"The number of rides with 0.0 Haversine distance is: {zero_distance_count}")


mean_fare = zero_distance_rides['fare_amount'].mean()
print(f"The mean fare amount for rides with 0.0 Haversine distance is: {mean_fare}")


max_fare_amount = df['fare_amount'].max()
print(f"The maximum fare amount for a ride is: {max_fare_amount}")


# Distance of the single most expensive ride, recomputed from that row's
# coordinates.
max_fare_row = df.loc[df['fare_amount'].idxmax()]
pickup_lat = max_fare_row['pickup_latitude']
pickup_lon = max_fare_row['pickup_longitude']
dropoff_lat = max_fare_row['dropoff_latitude']
dropoff_lon = max_fare_row['dropoff_longitude']
distance = haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)
print(f"The Haversine distance for the costliest ride is: {distance:.6f} km")


# Parse the pickup timestamps once; the year/month accessors are reused by
# all of the date-based filters below.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
pickup_year = df['pickup_datetime'].dt.year
pickup_month = df['pickup_datetime'].dt.month

rides_2014 = df[pickup_year == 2014]
num_rides_2014 = rides_2014.shape[0]
print(f"Number of rides recorded in the year 2014: {num_rides_2014}")


first_quarter_2014 = df[(pickup_year == 2014) & (pickup_month.isin([1, 2, 3]))]
num_rides_first_quarter_2014 = first_quarter_2014.shape[0]
print(f"Number of rides recorded in the first quarter of 2014: {num_rides_first_quarter_2014}")


# Take an explicit copy: adding a column to a boolean-filtered slice of `df`
# is chained assignment and triggers pandas' SettingWithCopyWarning.
september_2010 = df[(pickup_year == 2010) & (pickup_month == 9)].copy()
september_2010['day_of_week'] = september_2010['pickup_datetime'].dt.day_name()
rides_per_day = september_2010.groupby('day_of_week').size()
max_rides_day = rides_per_day.idxmax()
max_rides_count = rides_per_day.max()
print(f"The day with maximum rides recorded in September 2010 was {max_rides_day} with {max_rides_count} rides.")
Leave a Comment