Untitled
unknown
plain_text
2 years ago
5.8 kB
6
Indexable
# Balancing ----------------------------------------------------------------------
# Splits the z-score-scaled dataset into stratified train/test sets, then builds
# three balanced variants of the TRAIN set (undersampling, oversampling by
# replication, SMOTE) and evaluates a Gaussian Naive Bayes model on each one
# against the same untouched test set.
#
# Names expected to be in scope from earlier in the file: file, values,
# positive, negative, np, plt, ds, read_csv, unique, ndarray, train_test_split,
# GaussianNB, plot_evaluation_results.
from matplotlib.pyplot import figure, savefig, show, bar, xlabel, ylabel, title
from ds_charts import bar_chart
from pandas import DataFrame, Series, concat
from imblearn.over_sampling import SMOTE

# Single seed shared by every random resampling step so runs are reproducible.
RANDOM_STATE = 42


def _print_balance(minority_label, minority_count, majority_label, majority_count):
    """Print the minority/majority class sizes and their ratio."""
    print('Minority class=', minority_label, ':', minority_count)
    print('Majority class=', majority_label, ':', majority_count)
    print('Proportion:', round(minority_count / majority_count, 2), ': 1')


def _evaluate_naive_bayes(trn_x, trn_y, tst_x, tst_y, figure_path):
    """Fit GaussianNB on the given train data, plot the evaluation, save it.

    Returns the fitted model, the sorted class labels found in ``trn_y`` and
    the predictions for the train and test sets, in that order.
    """
    class_labels = unique(trn_y)
    class_labels.sort()
    model = GaussianNB()
    model.fit(trn_x, trn_y)
    trn_pred = model.predict(trn_x)
    tst_pred = model.predict(tst_x)
    plot_evaluation_results(class_labels, trn_y, trn_pred, tst_y, tst_pred)
    savefig(figure_path)
    show()
    return model, class_labels, trn_pred, tst_pred


original = read_csv(f'dados/{file}_scaled_zscore.csv')
target = 'target'

# --- Train / test split ---------------------------------------------------------
# pop() removes the target column, so `original` holds only the features below.
y: np.ndarray = original.pop(target).values
X: np.ndarray = original.values
labels: np.ndarray = unique(y)
labels.sort()

trnX, tstX, trnY, tstY = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=0
)

train_bal = concat(
    [DataFrame(trnX, columns=original.columns), DataFrame(trnY, columns=[target])],
    axis=1,
)
train_bal.to_csv(f'dados/{file}_train_bal.csv', index=False)

test_bal = concat(
    [DataFrame(tstX, columns=original.columns), DataFrame(tstY, columns=[target])],
    axis=1,
)
test_bal.to_csv(f'dados/{file}_test_bal.csv', index=False)

# Each pair is [#records that are not `negative`, #records that are not `positive`].
values['Train_bal'] = [
    len(np.delete(trnY, np.argwhere(trnY == negative))),
    len(np.delete(trnY, np.argwhere(trnY == positive))),
]
values['Test_bal'] = [
    len(np.delete(tstY, np.argwhere(tstY == negative))),
    len(np.delete(tstY, np.argwhere(tstY == positive))),
]

plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
# plt.show()

# Reload the split exactly as it was persisted to disk.
train_bal: DataFrame = read_csv(f'dados/{file}_train_bal.csv')
test_bal: DataFrame = read_csv(f'dados/{file}_test_bal.csv')

# Independent copies: every balancing strategy pops the target column from its
# own frame, so they must not share one.
train_bal_1 = train_bal.copy()
train_bal_2 = train_bal.copy()
train_bal_3 = train_bal.copy()

# --- Class balance of the training set ------------------------------------------
print(train_bal_1.columns)
target_count = train_bal_1[target].value_counts()
positive_class = target_count.idxmin()
negative_class = target_count.idxmax()
ind_positive_class = target_count.index.get_loc(positive_class)
_print_balance(
    positive_class, target_count[positive_class],
    negative_class, target_count[negative_class],
)
# NOTE: this rebinds `values`, discarding the 'Train_bal'/'Test_bal' entries above.
values = {'Original': [target_count[positive_class], target_count[negative_class]]}

figure()
# Original author's note: the chart is badly aligned.
bar_chart(target_count.index, target_count.values, title='Class balance')
savefig(f'dados/{file}_balance.png')
# show()

# --- Undersampling --------------------------------------------------------------
df_positives = train_bal_1[train_bal_1[target] == positive_class]
df_negatives = train_bal_1[train_bal_1[target] == negative_class]

# Seeded so the majority-class subset is the same on every run.
df_neg_sample = DataFrame(
    df_negatives.sample(len(df_positives), random_state=RANDOM_STATE)
)
df_under = concat([df_positives, df_neg_sample], axis=0)
# NOTE(review): every other output goes under 'dados/'; this path does not.
# Left unchanged in case something downstream reads it — confirm and align.
df_under.to_csv(f'{file}_undersample.csv', index=False)
values['UnderSample'] = [len(df_positives), len(df_neg_sample)]
_print_balance(positive_class, len(df_positives), negative_class, len(df_neg_sample))

# Naive Bayes on the undersampled training set.
trnY: ndarray = df_under.pop(target).values
trnX: ndarray = df_under.values
# The test set is extracted once here and reused by all three evaluations;
# pop() leaves `test_bal` without its target column.
tstY: ndarray = test_bal.pop(target).values
tstX: ndarray = test_bal.values
clf, labels, prd_trn, prd_tst = _evaluate_naive_bayes(
    trnX, trnY, tstX, tstY, f'dados/{file}_nb_best_under.png'
)

# --- Replication (oversampling) -------------------------------------------------
df_positives = train_bal_2[train_bal_2[target] == positive_class]
df_negatives = train_bal_2[train_bal_2[target] == negative_class]

# Sample the minority class with replacement up to the majority class size.
df_pos_sample = DataFrame(
    df_positives.sample(len(df_negatives), replace=True, random_state=RANDOM_STATE)
)
df_over = concat([df_pos_sample, df_negatives], axis=0)
df_over.to_csv(f'dados/{file}_oversample.csv', index=False)
values['OverSample'] = [len(df_pos_sample), len(df_negatives)]
_print_balance(positive_class, len(df_pos_sample), negative_class, len(df_negatives))

# Naive Bayes on the oversampled training set.
# Fix: the model is now trained and evaluated on trnX_rep/trnY_rep; previously
# it reused the undersampled trnX/trnY, so this figure duplicated the
# undersampling result.
trnY_rep: ndarray = df_over.pop(target).values
trnX_rep: ndarray = df_over.values
clf, labels, prd_trn, prd_tst = _evaluate_naive_bayes(
    trnX_rep, trnY_rep, tstX, tstY, f'dados/{file}_nb_best_over.png'
)

# --- SMOTE ----------------------------------------------------------------------
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
y = train_bal_3.pop(target).values
X = train_bal_3.values
smote_X, smote_y = smote.fit_resample(X, y)

# fit_resample returns bare arrays here, so the column names are restored by hand.
df_smote = concat([DataFrame(smote_X), DataFrame(smote_y)], axis=1)
df_smote.columns = list(original.columns) + ['target']
df_smote.to_csv(f'dados/{file}_smote.csv', index=False)

smote_target_count = Series(smote_y).value_counts()
values['SMOTE'] = [
    smote_target_count[positive_class],
    smote_target_count[negative_class],
]
_print_balance(
    positive_class, smote_target_count[positive_class],
    negative_class, smote_target_count[negative_class],
)

# Naive Bayes on the SMOTE-balanced training set.
trnY: ndarray = df_smote.pop(target).values
trnX: ndarray = df_smote.values
clf, labels, prd_trn, prd_tst = _evaluate_naive_bayes(
    trnX, trnY, tstX, tstY, f'dados/{file}_nb_best_smote.png'
)
Editor is loading...