Untitled
unknown
python
3 years ago
2.2 kB
9
Indexable
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
# Read the dataset from 'data.csv' into a pandas DataFrame.
data = pd.read_csv('data.csv')
# Hold out 20% of the rows for testing. The 'csat' column is the target and
# every remaining column is a feature; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='csat'),
    data['csat'],
    test_size=0.2,
    random_state=42,
)
# Train one linear regression model per language. Each model is fitted only
# on the training rows of its language, with the 'language' column removed so
# that only the remaining feature columns reach the regressor.
# Rows with a missing language are skipped: NaN never compares equal to
# itself, so its row mask would be empty and fitting would fail.
languages = X_train['language'].dropna().unique()
models = {}
for language in languages:
    # Build the row mask once and reuse it for both features and target so
    # the two selections cannot drift apart.
    lang_mask = X_train['language'] == language
    X_train_lang = X_train[lang_mask].drop('language', axis=1)
    y_train_lang = y_train[lang_mask]
    models[language] = LinearRegression().fit(X_train_lang, y_train_lang)
# One-hot encode only the 'language' column for the Decision Tree and Random
# Forest models; every other column is passed through unchanged. Fitting a
# bare OneHotEncoder on the whole frame would also turn each numeric feature
# into categories, and handle_unknown='ignore' would then silently zero out
# any numeric value that did not occur in the training set.
# sparse_threshold=0 makes the transformer always return a dense array, which
# avoids the OneHotEncoder `sparse` keyword (renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4).
encoder = ColumnTransformer(
    transformers=[
        ('language', OneHotEncoder(handle_unknown='ignore'), ['language']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
)
X_train_encoded = encoder.fit_transform(X_train)
X_test_encoded = encoder.transform(X_test)
# Train the Decision Tree and Random Forest models on the full training set.
# The seed matches the one used for the train/test split so repeated runs
# produce the same fitted models.
models['DecisionTree'] = DecisionTreeRegressor(random_state=42).fit(X_train_encoded, y_train)
models['RandomForest'] = RandomForestRegressor(random_state=42).fit(X_train_encoded, y_train)
# Evaluate the performance of each model on the testing set.
# The 'DecisionTree' and 'RandomForest' entries are scored on the whole
# encoded test set. Every other key in `models` is a language value whose
# model is scored only on the test rows of that language, so predictions and
# targets always have the same length and row order.
results = {}
for name, model in models.items():
    if name in ('DecisionTree', 'RandomForest'):
        y_true = y_test
        y_pred = model.predict(X_test_encoded)
    else:
        lang_mask = X_test['language'] == name
        # R2 is undefined for fewer than two samples, and predicting on an
        # empty frame raises, so such languages are left out of the results.
        if lang_mask.sum() < 2:
            continue
        y_true = y_test[lang_mask]
        y_pred = model.predict(X_test[lang_mask].drop('language', axis=1))
    results[name] = {
        'MSE': mean_squared_error(y_true, y_pred),
        'R2': r2_score(y_true, y_pred),
    }
# Print the results as a tab-separated table, one row per entry in `results`.
print('Model\t\t\tMSE\t\tR2')
for name, result in results.items():
    print(f'{name}\t\t{result["MSE"]:.2f}\t\t{result["R2"]:.2f}')
# Select the best model based on R2 score: the key of `results` with the
# highest stored R2 wins, and the fitted estimator is looked up in `models`.
best_model_name = max(results, key=lambda model_name: results[model_name]['R2'])
best_model = models[best_model_name]
print(f'\nThe best model is {best_model_name} with an R2 score of {results[best_model_name]["R2"]:.2f}.')