Untitled
unknown
python
2 years ago
2.5 kB
3
Indexable
# Compare regressors for predicting the 'csat' column of data.csv:
# one linear regression per language, plus a decision tree and a random
# forest fitted on all rows. Prints MSE / R2 per model and the best R2.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor

# Models fitted on every row; any other key in `models` is a language value.
GLOBAL_MODEL_NAMES = ('DecisionTree', 'RandomForest')

# Load the data into a Pandas DataFrame
data = pd.read_csv('data.csv')

# Split the data into training and testing sets
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    data.drop('csat', axis=1), data['csat'], test_size=0.2, random_state=42)

# Capture each row's language BEFORE encoding. After the transform the
# feature matrices are plain arrays, so row masks must be built from the
# split frames (not from the unsplit `data`) to stay aligned with them.
train_languages = X_train_raw['language'].to_numpy()
test_languages = X_test_raw['language'].to_numpy()

# One-hot encode the language column
preprocessor = ColumnTransformer(
    # handle_unknown='ignore': a language seen only in the test split must
    # not make transform() raise.
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), ['language'])],
    remainder='passthrough',
    # Always return a dense array so boolean row masks index it directly.
    sparse_threshold=0,
)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Train a linear regression model for each language.
# Only languages present in the training split can be fitted.
languages = X_train_raw['language'].dropna().unique()
models = {}
for language in languages:
    train_rows = train_languages == language
    models[language] = LinearRegression().fit(
        X_train[train_rows], y_train[train_rows])

# Train other regression models
models['DecisionTree'] = DecisionTreeRegressor().fit(X_train, y_train)
models['RandomForest'] = RandomForestRegressor().fit(X_train, y_train)

# Evaluate the performance of each model on the testing set.
# NOTE: a per-language model is scored only on that language's test rows,
# so its metrics are not directly comparable to the whole-test-set metrics
# of the global models.
results = {}
for name, model in models.items():
    if name in GLOBAL_MODEL_NAMES:
        y_true = y_test
        y_pred = model.predict(X_test)
    else:
        test_rows = test_languages == name
        # R2 is undefined for fewer than two samples; skip such languages
        # rather than record a NaN that would break the max() below.
        if test_rows.sum() < 2:
            continue
        y_true = y_test[test_rows]
        y_pred = model.predict(X_test[test_rows])
    results[name] = {
        'MSE': mean_squared_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }

# Print the results
print('Model\t\t\tMSE\t\tR2')
for name, result in results.items():
    print(f'{name}\t\t{result["MSE"]:.2f}\t\t{result["R2"]:.2f}')

# Select the best model based on R2 score
best_model_name = max(results, key=lambda x: results[x]['R2'])
best_model = models[best_model_name]
print(f'\nThe best model is {best_model_name} with an R2 score of {results[best_model_name]["R2"]:.2f}.')
Editor is loading...