import pandas as pd
import numpy as np

# Data Exploration
# Load data
df = pd.read_csv('data.csv')  # Adjust filename as needed

# View basic information
print(df.head())  # First 5 rows
print(df.describe())  # Summary statistics
print(df.info())  # Data types and non-null counts
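# A quick data-quality check is also common at this stage
print(df.isna().sum())        # Missing values per column
print(df.duplicated().sum())  # Count of fully duplicated rows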

# Data Manipulation
## Filtering
filtered_data = df[df['column'] > value]  # Adjust column and value as needed
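# Conditions can be combined with the bitwise operators & and |, each wrapped in
# parentheses ('other_column' and the literals below are placeholders)
multi_filtered = df[(df['column'] > value) & (df['other_column'] == 'some_value')]
in_list = df[df['column'].isin(['a', 'b', 'c'])]  # Membership test, like SQL's IN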

## Sorting
sorted_data = df.sort_values(by='column', ascending=True)  # Adjust column as needed
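# Sorting on several keys, each with its own direction (placeholder names)
multi_sorted = df.sort_values(by=['column', 'other_column'], ascending=[True, False])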

## Aggregation
aggregated_data = df.aggregate({'column': ['min', 'max', 'mean']})  # Adjust column as needed

## Joining Data Frames
other_df = pd.read_csv('other_data.csv')  # Load another dataset
joined_data = pd.merge(df, other_df, on='common_column', how='inner')  # Adjust column and how as needed

## GroupBy Mechanism
grouped_data = df.groupby('column').mean(numeric_only=True)  # Adjust column as needed; numeric_only skips non-numeric columns
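# Named per-group aggregations read much like SQL's GROUP BY (placeholder names)
group_summary = df.groupby('column').agg(
    total=('other_column', 'sum'),
    average=('other_column', 'mean'),
)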

# File Formats and Standard Libraries/Tools
## Reading different formats
csv_data = pd.read_csv('data.csv')
excel_data = pd.read_excel('data.xlsx', sheet_name='Sheet1')  # Adjust sheet_name as needed
json_data = pd.read_json('data.json')
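# Parquet is another common format; read_parquet assumes a pyarrow or
# fastparquet engine is installed
parquet_data = pd.read_parquet('data.parquet')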

## Saving data
df.to_csv('output.csv', index=False)
df.to_excel('output.xlsx', index=False)
df.to_json('output.json')
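# Parquet output, mirroring read_parquet above (also assumes a parquet engine)
df.to_parquet('output.parquet')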

# SQL-like operations in Pandas
## Query basics
query_result = df.query('column > value')  # Adjust column and value as needed
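# Local Python variables can be referenced inside query strings with '@'
threshold = 10  # Placeholder value
query_result_param = df.query('column > @threshold')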

## Merges and joins
merged_data = pd.merge(df, other_df, on='key_column', how='inner')  # Adjust key_column and how as needed
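# Stacking rows from two frames with matching columns, like SQL's UNION ALL
stacked = pd.concat([df, other_df], ignore_index=True)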

## Window functions and analytic functions
df['cumulative_sum'] = df['column'].expanding().sum()  # Adjust column as needed
df['rolling_average'] = df['column'].rolling(window=3).mean()  # Adjust column and window size as needed
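# Analogues of SQL's LAG and RANK window functions (placeholder names)
df['previous_value'] = df['column'].shift(1)                       # LAG(column, 1)
df['rank_in_group'] = df.groupby('group_column')['column'].rank()  # RANK() OVER (PARTITION BY group_column)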



import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Load the train and test datasets
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Fill missing values in the age column with the mean age of the houses
imputer = SimpleImputer(strategy='mean')
train_df['age'] = imputer.fit_transform(train_df[['age']])
test_df['age'] = imputer.transform(test_df[['age']])

# Convert the condition_name column into numerical values using ordinal encoding
ordinal_encoder = OrdinalEncoder()
train_df['condition_name'] = ordinal_encoder.fit_transform(train_df[['condition_name']])
test_df['condition_name'] = ordinal_encoder.transform(test_df[['condition_name']])
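# Optional safeguard (a sketch, not part of the original script): OrdinalEncoder
# raises on categories seen only at test time; handle_unknown maps them to a sentinel
safe_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)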

# Normalize the sqft_living and sqft_lot columns using Standard Scaling
scaler = StandardScaler()
train_df[['sqft_living', 'sqft_lot']] = scaler.fit_transform(train_df[['sqft_living', 'sqft_lot']])
test_df[['sqft_living', 'sqft_lot']] = scaler.transform(test_df[['sqft_living', 'sqft_lot']])

# Save the processed datasets as CSV files
train_df.to_csv('processed_train.csv', index=False)
test_df.to_csv('processed_test.csv', index=False)


import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

data = pd.read_csv('data/wine_quality/winequality-red.csv', delimiter=';')

# Split data into training, validation, and test sets (60/20/20)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.25, random_state=42)

# Separate the features from the 'quality' target in each split
X_train, y_train = train_data.drop(columns=['quality']), train_data['quality']
X_val, y_val = val_data.drop(columns=['quality']), val_data['quality']
X_test, y_test = test_data.drop(columns=['quality']), test_data['quality']

# Alternatively, perform k-fold cross-validation on the training set
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in k_fold.split(X_train):
    X_fold_train, X_fold_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_fold_train, y_fold_val = y_train.iloc[train_index], y_train.iloc[val_index]

# Alternatively, perform Leave-One-Out Cross-Validation (LOOCV)
loo = LeaveOneOut()
for train_index, val_index in loo.split(X_train):
    X_loo_train, X_loo_val = X_train.iloc[train_index], X_train.iloc[val_index]
    y_loo_train, y_loo_val = y_train.iloc[train_index], y_train.iloc[val_index]


from sklearn.ensemble import RandomForestRegressor

# Define model and hyperparameters
model = RandomForestRegressor(random_state=42)
params = {'n_estimators': [100, 200]}
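# A quick baseline before tuning: cross_val_score (imported above) wraps the
# k-fold loop in a single call, scored here with R^2 on the training split
cv_scores = cross_val_score(model, X_train, y_train, scoring='r2', cv=5)
print('Baseline CV R^2:', cv_scores.mean())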


# Perform hyperparameter tuning using GridSearchCV
grid_search = GridSearchCV(estimator=model, param_grid=params, scoring='r2', cv=5)
grid_search.fit(X_train, y_train)

# Evaluate the tuned model on the held-out validation set
y_pred = grid_search.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# A regressor's predictions are already a 1-D array, so no argmax is needed
predictions = pd.DataFrame({'quality': y_pred})
predictions.to_csv('predictions.csv', index=False)
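# Inspect the tuning results (standard attributes of a fitted GridSearchCV)
print(grid_search.best_params_)
print(f'Validation MSE: {mse:.4f}, R^2: {r2:.4f}')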
