Untitled
unknown
plain_text
7 months ago
6.3 kB
1
Indexable
Never
"""Pandas and scikit-learn walkthrough.

The script is organised as three independent routines so that importing the
module performs no file I/O:

1. ``pandas_walkthrough``      - exploration, manipulation, file formats,
                                 SQL-like queries and window functions.
2. ``preprocess_housing``      - mean imputation, ordinal encoding and
                                 standard scaling of a train/test pair.
3. ``tune_wine_quality_model`` - hold-out split, cross-validation splitters
                                 and a grid-searched random forest regressor.

Run the file directly to execute all three in order.
"""

import numpy as np
import pandas as pd

# scikit-learn is imported inside the functions that use it, so the
# pandas-only helpers below stay importable when it is not installed.

# Placeholders for the generic walkthrough - adjust to the dataset at hand.
COLUMN = "column"
VALUE = 0
COMMON_COLUMN = "common_column"
KEY_COLUMN = "key_column"
WINDOW = 3

# Settings for the wine-quality experiment.
TARGET = "quality"
RANDOM_STATE = 42


def add_window_features(df, column, window=WINDOW):
    """Return a copy of *df* with expanding-sum and rolling-mean columns.

    Args:
        df: Input frame; it is not modified.
        column: Name of the column the window functions are computed on.
        window: Number of rows in the rolling-mean window.

    Returns:
        A new DataFrame with ``cumulative_sum`` and ``rolling_average`` added.
        The first ``window - 1`` rolling averages are NaN because the window
        is not yet full.
    """
    result = df.copy()
    result["cumulative_sum"] = result[column].expanding().sum()
    result["rolling_average"] = result[column].rolling(window=window).mean()
    return result


def split_features_target(frame, target=TARGET):
    """Split *frame* into a feature frame and the *target* series.

    Args:
        frame: DataFrame that contains the target column.
        target: Name of the column to predict.

    Returns:
        ``(features, labels)`` where ``features`` is *frame* without the
        target column and ``labels`` is the target column itself.
    """
    return frame.drop(columns=[target]), frame[target]


def iter_cv_folds(splitter, features, labels):
    """Yield ``(X_train, X_val, y_train, y_val)`` for every fold of *splitter*.

    Args:
        splitter: Object with a ``split(features)`` method yielding
            ``(train_index, val_index)`` positional index pairs, e.g.
            scikit-learn's ``KFold`` or ``LeaveOneOut``.
        features: Feature frame, indexed positionally via ``iloc``.
        labels: Target series aligned row-for-row with *features*.
    """
    for train_index, val_index in splitter.split(features):
        yield (
            features.iloc[train_index],
            features.iloc[val_index],
            # Labels come from the target series, not from the feature frame.
            labels.iloc[train_index],
            labels.iloc[val_index],
        )


def pandas_walkthrough(
    data_path="data.csv",
    other_path="other_data.csv",
    column=COLUMN,
    value=VALUE,
    common_column=COMMON_COLUMN,
    key_column=KEY_COLUMN,
    window=WINDOW,
):
    """Run the generic pandas tour on *data_path* and return every result.

    Prints the head, summary statistics and dtype info of the main frame,
    then filters, sorts, aggregates, joins, groups, re-reads the data in
    several formats, writes ``output.csv`` / ``output.xlsx`` / ``output.json``
    and finally adds window-function columns.

    Args:
        data_path: CSV file holding the main dataset.
        other_path: CSV file holding the dataset to join against.
        column: Column used for filtering, sorting, aggregating and grouping.
        value: Threshold used by the filter and the query.
        common_column: Join key for the first merge.
        key_column: Join key for the second merge.
        window: Rolling-mean window size.

    Returns:
        Dict mapping a short name to each intermediate DataFrame.
    """
    # --- Data exploration -------------------------------------------------
    df = pd.read_csv(data_path)
    print(df.head())       # first 5 rows
    print(df.describe())   # summary statistics
    df.info()              # prints dtypes / non-null counts itself, returns None

    # --- Data manipulation ------------------------------------------------
    filtered_data = df[df[column] > value]
    sorted_data = df.sort_values(by=column, ascending=True)
    aggregated_data = df.aggregate({column: ["min", "max", "mean"]})

    other_df = pd.read_csv(other_path)
    joined_data = pd.merge(df, other_df, on=common_column, how="inner")

    # numeric_only avoids a TypeError when the frame has non-numeric columns.
    grouped_data = df.groupby(column).mean(numeric_only=True)

    # --- File formats -----------------------------------------------------
    csv_data = pd.read_csv("data.csv")
    excel_data = pd.read_excel("data.xlsx", sheet_name="Sheet1")
    json_data = pd.read_json("data.json")

    df.to_csv("output.csv", index=False)
    df.to_excel("output.xlsx", index=False)
    df.to_json("output.json")

    # --- SQL-like operations ----------------------------------------------
    # Backticks allow column names with spaces; @value reads the local variable.
    query_result = df.query(f"`{column}` > @value")
    merged_data = pd.merge(df, other_df, on=key_column, how="inner")

    # --- Window / analytic functions --------------------------------------
    windowed_data = add_window_features(df, column, window=window)

    return {
        "filtered": filtered_data,
        "sorted": sorted_data,
        "aggregated": aggregated_data,
        "joined": joined_data,
        "grouped": grouped_data,
        "csv": csv_data,
        "excel": excel_data,
        "json": json_data,
        "query": query_result,
        "merged": merged_data,
        "windowed": windowed_data,
    }


def preprocess_housing(
    train_path="data/train.csv",
    test_path="data/test.csv",
    train_out="processed_train.csv",
    test_out="processed_test.csv",
):
    """Impute, encode and scale the housing train/test CSVs, then save them.

    Every transformer is fitted on the training frame only and then applied
    to the test frame, so no test-set statistics leak into preprocessing.

    Args:
        train_path: CSV with the training rows.
        test_path: CSV with the test rows.
        train_out: Destination CSV for the processed training rows.
        test_out: Destination CSV for the processed test rows.

    Returns:
        ``(train_df, test_df)`` after preprocessing.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import OrdinalEncoder, StandardScaler

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Fill missing ages with the mean age seen in the training data.
    imputer = SimpleImputer(strategy="mean")
    train_df[["age"]] = imputer.fit_transform(train_df[["age"]])
    test_df[["age"]] = imputer.transform(test_df[["age"]])

    # NOTE(review): with default settings the encoder derives its category
    # order from the data rather than from any domain ranking of conditions,
    # and it raises on categories that appear only in the test set - confirm
    # both are acceptable for condition_name.
    ordinal_encoder = OrdinalEncoder()
    train_df[["condition_name"]] = ordinal_encoder.fit_transform(
        train_df[["condition_name"]]
    )
    test_df[["condition_name"]] = ordinal_encoder.transform(
        test_df[["condition_name"]]
    )

    # Standardise the two area columns.
    area_columns = ["sqft_living", "sqft_lot"]
    scaler = StandardScaler()
    train_df[area_columns] = scaler.fit_transform(train_df[area_columns])
    test_df[area_columns] = scaler.transform(test_df[area_columns])

    train_df.to_csv(train_out, index=False)
    test_df.to_csv(test_out, index=False)
    return train_df, test_df


def tune_wine_quality_model(
    data_path="data//wine_quality//winequality-red.csv",
    predictions_path="predictions.csv",
):
    """Grid-search a random forest regressor on the red-wine quality data.

    The data is split 60/20/20 into training, validation and test parts.
    The grid search (5-fold, scored by R^2) is fitted on the training part,
    evaluated on the validation part, and the validation predictions are
    written to *predictions_path*.

    Args:
        data_path: Semicolon-delimited CSV containing a ``quality`` column.
        predictions_path: Destination CSV for the validation predictions.

    Returns:
        Dict with ``mse`` and ``r2`` on the validation part, the
        ``best_params`` chosen by the grid search, and ``fold_counts`` for the
        K-Fold and Leave-One-Out splitters applied to the training part.
    """
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import (
        GridSearchCV,
        KFold,
        LeaveOneOut,
        train_test_split,
    )

    data = pd.read_csv(data_path, delimiter=";")

    # 20% test, then 25% of the remaining 80% (= 20% overall) for validation.
    train_data, test_data = train_test_split(
        data, test_size=0.2, random_state=RANDOM_STATE
    )
    train_data, val_data = train_test_split(
        train_data, test_size=0.25, random_state=RANDOM_STATE
    )
    # test_data is left untouched so it can serve as a final held-out check.

    X_train, y_train = split_features_target(train_data)
    X_val, y_val = split_features_target(val_data)

    # Alternative resampling strategies to the hold-out split above. The folds
    # are only counted here and do not overwrite the hold-out split, so the
    # model below is always trained and scored on the full partitions.
    splitters = {
        "k_fold": KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE),
        "leave_one_out": LeaveOneOut(),
    }
    fold_counts = {
        name: sum(1 for _ in iter_cv_folds(splitter, X_train, y_train))
        for name, splitter in splitters.items()
    }

    model = RandomForestRegressor(random_state=RANDOM_STATE)
    params = {"n_estimators": [100, 200]}
    grid_search = GridSearchCV(
        estimator=model, param_grid=params, scoring="r2", cv=5
    )
    grid_search.fit(X_train, y_train)

    y_pred = grid_search.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    # A regressor returns one value per row, so there is no class axis to
    # take an argmax over; the predictions are saved as they are.
    predictions = pd.DataFrame({"quality": np.ravel(y_pred)})
    predictions.to_csv(predictions_path, index=False)

    return {
        "mse": mse,
        "r2": r2,
        "best_params": grid_search.best_params_,
        "fold_counts": fold_counts,
    }


def main():
    """Run the walkthrough, the preprocessing and the model tuning in order."""
    pandas_walkthrough()
    preprocess_housing()
    metrics = tune_wine_quality_model()
    print(f"Fold counts: {metrics['fold_counts']}")
    print(f"Validation MSE: {metrics['mse']:.4f}, R^2: {metrics['r2']:.4f}")


if __name__ == "__main__":
    main()
Leave a Comment