# Assignment 3 — model evaluation on a credit-card fraud dataset
import numpy as np
import pandas as pd
# Question 1a
def answer_one():
    """Return the fraction of observations in the fraud dataset that are
    fraudulent (Class == 1)."""
    frauds = pd.read_csv('assets/fraud_data.csv')
    # Mean of a boolean mask == proportion of True values.
    return float((frauds['Class'] == 1).mean())

answer_one()
# Question 1b
# Use X_train, X_test, y_train, y_test for all of the following questions
from sklearn.model_selection import train_test_split

df = pd.read_csv('assets/fraud_data.csv')
# All columns except the last are features; the last column is the label.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Fixed random_state so every question sees the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# Question 2
def answer_two():
    """Fit a majority-class dummy classifier on the training split and
    return (accuracy, recall) measured on the test split."""
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import recall_score

    # Negative class (0) is most frequent, so 'most_frequent' always
    # predicts class 0 — recall on the positive class will be 0.
    baseline = DummyClassifier(strategy='most_frequent')
    baseline.fit(X_train, y_train)
    predictions = baseline.predict(X_test)

    accuracy = baseline.score(X_test, y_test)
    recall = recall_score(y_test, predictions)
    return accuracy, recall

answer_two()
# Question 3
def answer_three():
    """Fit a default-parameter SVC and return (accuracy, recall, precision)
    on the test split."""
    from sklearn.metrics import recall_score, precision_score
    from sklearn.svm import SVC

    clf = SVC()
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    accuracy = clf.score(X_test, y_test)
    recall = recall_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    return accuracy, recall, precision

answer_three()
# Question 4
def answer_four():
    """Return the confusion matrix for an SVC (C=1e9, gamma=1e-7) whose
    decision-function scores are thresholded at -220 instead of 0."""
    from sklearn.metrics import confusion_matrix
    from sklearn.svm import SVC

    clf = SVC(C=1e9, gamma=1e-07)
    clf.fit(X_train, y_train)
    scores = clf.decision_function(X_test)

    # Predict positive whenever the margin score clears the custom threshold;
    # shifting the threshold below 0 trades precision for recall.
    predictions = (scores >= -220.0).astype(int)
    return confusion_matrix(y_test, predictions)

answer_four()
# Question 5
def answer_five():
    """Fit a logistic regression and return (recall at precision ~0.75,
    TPR at FPR ~0.16), read off the precision-recall and ROC curves.

    Fixes over the original:
    - the model was fit twice (once for an unused predict_proba call);
      it is now fit once,
    - `precision == 0.75` / a narrow FPR window relied on an exact float
      match and crashed (`float()` on an empty or multi-element array)
      when no curve point matched; the nearest curve point is used instead.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import precision_recall_curve, roc_curve

    lr = LogisticRegression().fit(X_train, y_train)
    y_scores = lr.decision_function(X_test)

    precision, recall, _ = precision_recall_curve(y_test, y_scores)
    fpr, tpr, _ = roc_curve(y_test, y_scores)

    # Recall at the curve point whose precision is closest to 0.75.
    rec = float(recall[np.argmin(np.abs(precision - 0.75))])
    # True-positive rate at the curve point whose FPR is closest to 0.16.
    tpr_at = float(tpr[np.argmin(np.abs(fpr - 0.16))])
    return rec, tpr_at

answer_five()
# Question 6a
def answer_six():
    """Grid-search a LogisticRegression over penalty {l1, l2} and
    C {0.01, 0.1, 1, 10}, scored by recall, and return the mean test
    recall reshaped to (4, 2): rows are C values, columns are l1/l2.

    Fix: the default 'lbfgs' solver does not support penalty='l1' and
    raises ValueError in modern scikit-learn; 'liblinear' supports both
    penalties, so the grid search can evaluate every combination.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import LogisticRegression

    grid_values = {'penalty': ['l1', 'l2'], 'C': [0.01, 0.1, 1, 10]}
    lr = LogisticRegression(solver='liblinear')
    grid_lr = GridSearchCV(lr, param_grid=grid_values, scoring='recall')
    grid_lr.fit(X_train, y_train)

    # ParameterGrid iterates params sorted by name (C outer, penalty inner),
    # so reshape(-1, 2) puts one C value per row and l1/l2 in the columns.
    return grid_lr.cv_results_['mean_test_score'].reshape(-1, 2)

answer_six()
# Question 6b
# Use the following function to help visualize results from the grid search
def GridSearch_Heatmap(scores):
    """Render mean test recall as a heatmap: rows are the C values,
    columns are the l1/l2 penalties.

    Fix: the original body contained the IPython magic
    `%matplotlib notebook`, which is a SyntaxError inside a function in
    plain Python — enable the backend at the top of the notebook instead.
    """
    import seaborn as sns
    import matplotlib.pyplot as plt

    plt.figure()
    sns.heatmap(scores.reshape(4, 2),
                xticklabels=['l1', 'l2'],
                yticklabels=[0.01, 0.1, 1, 10])
    plt.yticks(rotation=0)

GridSearch_Heatmap(answer_six())
# (trailing paste artifact removed)