# Paste-site header (pastecode.io): mail@pastecode.io avatar
# a year ago
# 2.5 kB
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load the data into a Pandas DataFrame
data = pd.read_csv('data.csv')

# Split the data into training and testing sets.
# 'csat' is the regression target; every other column is a candidate feature.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('csat', axis=1),
    data['csat'],
    test_size=0.2,
    random_state=42,
)

# One-hot encode the language column.
# handle_unknown='ignore' encodes a language that appears only in the test
# split as an all-zero row instead of raising during transform().
# ColumnTransformer drops columns that are not listed in `transformers`
# (default remainder='drop'), so only the encoded language reaches the models.
# NOTE(review): pass remainder='passthrough' if the remaining columns are
# numeric and meant to be used as features -- confirm against the data schema.
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), ['language'])],
)
# Fit the encoder on the training split only, then reuse it for the test split.
# After this point X_train / X_test are positional matrices, not DataFrames.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Train a linear regression model for each language
languages = data['language'].unique()

# X_train is a transformed matrix indexed by position, while y_train still
# carries the original row labels of `data`. Look up the language of each
# training row through those labels so the mask has exactly one entry per
# training row (a mask built from the full `data` frame would be too long).
train_languages = data.loc[y_train.index, 'language'].to_numpy()

models = {}
for language in languages:
    train_mask = train_languages == language
    if not train_mask.any():
        # The language only occurs in the test split; there is nothing to fit.
        continue
    X_train_lang = X_train[train_mask]
    y_train_lang = y_train[train_mask]
    models[language] = LinearRegression().fit(X_train_lang, y_train_lang)

# Train other regression models on all training rows.
# random_state is fixed (matching the train/test split seed) so that repeated
# runs produce the same trees and therefore the same scores.
models['DecisionTree'] = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
models['RandomForest'] = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Evaluate the performance of each model on the testing set
results = {}

# Models stored under these keys were fitted on every training row; any other
# key in `models` is a language value with its own per-language model.
global_model_names = ('DecisionTree', 'RandomForest')

# X_test is indexed by position, y_test by the original row labels of `data`;
# recover the language of each test row through those labels.
test_languages = data.loc[y_test.index, 'language'].to_numpy()

for name, model in models.items():
    if name in global_model_names:
        # Global models are scored on the whole test set.
        y_true = y_test
        y_pred = model.predict(X_test)
    else:
        # Per-language models are scored only on test rows of their language.
        test_mask = test_languages == name
        if test_mask.sum() < 2:
            # R2 is not defined for fewer than two samples; skip this language.
            continue
        y_true = y_test[test_mask]
        y_pred = model.predict(X_test[test_mask])
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    results[name] = {'MSE': mse, 'R2': r2}

# Print the results: one line per evaluated model with its MSE and R2 score
for name, result in results.items():
    print(f'{name}: MSE = {result["MSE"]:.2f}, R2 = {result["R2"]:.2f}')

# Select the best model based on R2 score (higher is better)
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model = models[best_model_name]
print(f'\nThe best model is {best_model_name} with an R2 score of {results[best_model_name]["R2"]:.2f}.')