Untitled
unknown
plain_text
2 years ago
6.3 kB
8
Indexable
import pandas as pd
import numpy as np
# Data Exploration
# Load data
df = pd.read_csv('data.csv') # Adjust filename as needed
# View basic information
print(df.head()) # First 5 rows
print(df.describe()) # Summary statistics
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
df.info() # Data types and non-null counts
# Data Manipulation
## Filtering
# Placeholder threshold so the comparison below has a defined operand.
value = 0 # Adjust value as needed
filtered_data = df[df['column'] > value] # Adjust column as needed
## Sorting
sorted_data = df.sort_values(by='column', ascending=True) # Adjust column as needed
## Aggregation
aggregated_data = df.aggregate({'column': ['min', 'max', 'mean']}) # Adjust column as needed
## Joining Data Frames
other_df = pd.read_csv('other_data.csv') # Load another dataset
joined_data = pd.merge(df, other_df, on='common_column', how='inner') # Adjust column and how as needed
## GroupBy Mechanism
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas 2.x raises TypeError as soon as the frame holds a non-numeric column.
grouped_data = df.groupby('column').mean(numeric_only=True) # Adjust column as needed
# Files Formats and Standard Libraries/Tools
## Reading different formats
# Each reader returns a new DataFrame; the source is passed by keyword.
csv_data = pd.read_csv(filepath_or_buffer='data.csv')
excel_data = pd.read_excel(io='data.xlsx', sheet_name='Sheet1') # Adjust sheet_name as needed
json_data = pd.read_json(path_or_buf='data.json')
## Saving data
# index=False leaves the row index out of the CSV and Excel outputs.
df.to_csv(path_or_buf='output.csv', index=False)
df.to_excel(excel_writer='output.xlsx', index=False)
df.to_json(path_or_buf='output.json')
# SQL-like operations in Pandas
## Query basics
# NOTE(review): `value` carries no `@` prefix here, so pandas will resolve it
# as a column/index label rather than a Python variable -- confirm intent.
query_result = df.query(expr='column > value') # Adjust column and value as needed
## Merges and joins
# Inner join: only rows whose key is present in both frames are kept.
merged_data = df.merge(other_df, how='inner', on='key_column') # Adjust key_column and how as needed
## Window functions and analytic functions
# Expanding window: every row aggregates all rows from the start up to itself.
running_total = df['column'].expanding().sum() # Adjust column as needed
df['cumulative_sum'] = running_total
# Rolling window: every row aggregates itself and the two rows before it.
three_row_mean = df['column'].rolling(window=3).mean() # Adjust column and window size as needed
df['rolling_average'] = three_row_mean
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
# Read the training and test splits from disk
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Impute missing 'age' values with the mean learned from the training split
imputer = SimpleImputer(strategy='mean')
train_age = train_df[['age']]
imputer.fit(train_age)
train_df['age'] = imputer.transform(train_age)
test_df['age'] = imputer.transform(test_df[['age']])

# Replace each 'condition_name' category with its ordinal code;
# the category-to-code mapping is learned from the training split
ordinal_encoder = OrdinalEncoder()
train_condition = train_df[['condition_name']]
ordinal_encoder.fit(train_condition)
train_df['condition_name'] = ordinal_encoder.transform(train_condition)
test_df['condition_name'] = ordinal_encoder.transform(test_df[['condition_name']])

# Standardize 'sqft_living' and 'sqft_lot' with statistics from the training split
scaler = StandardScaler()
train_area = train_df[['sqft_living', 'sqft_lot']]
scaler.fit(train_area)
train_df[['sqft_living', 'sqft_lot']] = scaler.transform(train_area)
test_df[['sqft_living', 'sqft_lot']] = scaler.transform(test_df[['sqft_living', 'sqft_lot']])

# Write both processed splits to CSV without the row index
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
# Load both dataset splits
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Mean-impute 'age': fit on the training split, then apply to train and test in turn
imputer = SimpleImputer(strategy='mean').fit(train_df[['age']])
for frame in (train_df, test_df):
    frame['age'] = imputer.transform(frame[['age']])

# Ordinal-encode 'condition_name': categories are learned from the training split
ordinal_encoder = OrdinalEncoder().fit(train_df[['condition_name']])
for frame in (train_df, test_df):
    frame['condition_name'] = ordinal_encoder.transform(frame[['condition_name']])

# Standard-scale the two area columns using training-split statistics
scaler = StandardScaler().fit(train_df[['sqft_living', 'sqft_lot']])
for frame in (train_df, test_df):
    frame[['sqft_living', 'sqft_lot']] = scaler.transform(frame[['sqft_living', 'sqft_lot']])

# Persist the processed splits as CSV files (row index omitted)
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, KFold
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
data = pd.read_csv("data//wine_quality//winequality-red.csv", delimiter=';')

# Name of the label column; every other column is used as a feature.
TARGET = 'quality'

# Split data into Training (60%), Validation (20%), and Test (20%) sets.
# Whole rows are split first so that features and labels stay aligned.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Separate the features (X) from the label (y) in every split.
X_train, y_train = train_data.drop(columns=[TARGET]), train_data[TARGET]
X_val, y_val = val_data.drop(columns=[TARGET]), val_data[TARGET]
X_test, y_test = test_data.drop(columns=[TARGET]), test_data[TARGET]

# Alternatively, perform k-fold cross-validation.
# Fold-local names keep the hold-out split above from being overwritten.
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in k_fold.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

# Alternatively, perform Leave-One-Out Cross-Validation (LOOCV).
# Each iteration holds out exactly one row of the training split.
loo = LeaveOneOut()
for train_index, val_index in loo.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

from sklearn.ensemble import RandomForestRegressor
# Define model and hyperparameters
model = RandomForestRegressor(random_state=42)
params = {'n_estimators': [100, 200]}
# Perform hyperparameter tuning using GridSearchCV (5-fold CV inside the training split)
grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the tuned model on the hold-out validation split.
y_pred = grid_search.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# The regressor returns one continuous value per row (a 1-D array), so there
# is no class axis to argmax over; round to the nearest integer score instead.
predictions = pd.DataFrame({'quality': np.rint(y_pred).astype(int)})
predictions.to_csv('predictions.csv', index=False)
Editor is loading...
Leave a Comment