Untitled
unknown
python
22 days ago
2.2 kB
3
Indexable
Never
"""Train a decision-tree fare model on taxi trips and submit predictions to Kaggle.

Reads ``train_2M.csv`` and ``test.csv`` from the working directory, writes
``final_submission.csv`` and submits it with the ``kaggle`` command-line tool.
"""
import subprocess
import sys

import pandas as pd

# Rows whose pickup/dropoff coordinates differ by this much or more (in the
# units of the longitude/latitude columns) are treated as unrealistic.
MAX_ABS_COORD_DIFF = 5.0


def add_travel_vector_features(df):
    """Add absolute longitude/latitude travel distances to ``df`` in place.

    Creates the columns ``abs_diff_longitude`` and ``abs_diff_latitude`` from
    the pickup/dropoff coordinate columns. Returns ``None`` because the frame
    is modified in place.
    """
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()


def filter_unrealistic_trips(df, max_abs_diff=MAX_ABS_COORD_DIFF):
    """Return the rows of ``df`` whose travel-vector features are below the limit.

    ``df`` must already contain the columns created by
    ``add_travel_vector_features``. Rows where either difference is NaN are
    dropped as well, because the comparison is False for NaN.
    """
    realistic = (
        (df.abs_diff_longitude < max_abs_diff)
        & (df.abs_diff_latitude < max_abs_diff)
    )
    return df[realistic]


def to_numeric_frame(df):
    """Return a copy of ``df`` with every column converted by ``pd.to_numeric``.

    Values that cannot be parsed as numbers become NaN (``errors='coerce'``).
    """
    return df.apply(pd.to_numeric, errors='coerce')


# The guard keeps the helpers above importable without the CSV files or
# scikit-learn. When run as a script or notebook cell the pipeline still
# executes and defines the same module-level names as before.
if __name__ == "__main__":
    from sklearn.tree import DecisionTreeRegressor

    # Load training data
    train = pd.read_csv('train_2M.csv', parse_dates=["pickup_datetime"])

    # Add features to training data
    add_travel_vector_features(train)

    # Drop rows with any missing values
    train = train.dropna(how='any', axis='rows')

    # Filter out unrealistic values (training rows only)
    train = filter_unrealistic_trips(train)

    # Prepare features and target. The 'key' identifier is excluded from the
    # features: coerced to numeric it would only contribute NaN wherever it is
    # not a number. errors='ignore' tolerates a training file without 'key'.
    y = train['fare_amount']
    X = to_numeric_frame(
        train.drop(columns=['fare_amount', 'key'], errors='ignore')
    )

    # Load and prepare test data
    test = pd.read_csv('test.csv')
    key = test.key

    # Add features to test data
    add_travel_vector_features(test)

    # NOTE: test rows are deliberately NOT filtered. Every key needs exactly
    # one prediction; dropping rows here would make `key` and the predictions
    # differ in length and break the submission frame below.

    # Parse the pickup timestamp so it converts to numeric the same way the
    # parsed training column does.
    test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

    # Keep exactly the training feature columns, in the same order
    test = to_numeric_frame(test[X.columns])

    # Initialize and train the model
    optimal_depth = 9
    dtr = DecisionTreeRegressor(max_depth=optimal_depth)
    dtr.fit(X, y)

    # Make predictions
    test_predictions = dtr.predict(test)

    # Create the submission DataFrame
    submission = pd.DataFrame({
        'key': key,
        'fare_amount': test_predictions,
    })

    # Ensure 'key' column is of type string
    submission['key'] = submission['key'].astype(str)

    # Print column types to verify
    print(submission.dtypes)

    # Save submission file
    submission.to_csv('final_submission.csv', index=False)
    print("Submission file created successfully!")

    # Submit to Kaggle. An argument list (no shell) replaces the notebook-only
    # `!kaggle ...` magic so the file is valid Python everywhere.
    submit_result = subprocess.run(
        [
            'kaggle', 'competitions', 'submit',
            '-c', 'new-york-city-taxi-fare-prediction',
            '-f', 'final_submission.csv',
            '-m', 'Final submission test',
        ],
        capture_output=True,
        text=True,
    )
    # Echo the tool's output explicitly; a child process's inherited streams
    # are not shown in a notebook cell.
    if submit_result.stdout:
        print(submit_result.stdout, end='')
    if submit_result.stderr:
        print(submit_result.stderr, end='', file=sys.stderr)
    # Fail loudly rather than print a score after a failed upload.
    submit_result.check_returncode()

    # Print score after submission (hard-coded value, not read from Kaggle)
    print(f"My new score is {3.54843}")
Leave a Comment