Untitled

mail@pastecode.io avatar
unknown
plain_text
a year ago
5.8 kB
1
Indexable
Never

#Balancing----------------------------------------------------------------------

# Load the z-score-scaled dataset; `target` is the name of the class column.
target = 'target'
original = read_csv(f'dados/{file}_scaled_zscore.csv')

### train and test split

# pop() removes the class column in place, so from here on `original`
# contains only the feature columns.
y: np.ndarray = original.pop(target).values
X: np.ndarray = original.values
labels: np.ndarray = unique(y)
labels.sort()

# 70/30 split, stratified on the class labels, with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=0
)

# Re-attach the class column to each partition and write it to disk.
trn_features = DataFrame(trnX, columns=original.columns)
trn_target = DataFrame(trnY, columns=[target])
train_bal = concat([trn_features, trn_target], axis=1)
train_bal.to_csv(f'dados/{file}_train_bal.csv', index=False)

tst_features = DataFrame(tstX, columns=original.columns)
tst_target = DataFrame(tstY, columns=[target])
test_bal = concat([tst_features, tst_target], axis=1)
test_bal.to_csv(f'dados/{file}_test_bal.csv', index=False)

# Per partition: [rows whose label differs from `negative`,
#                 rows whose label differs from `positive`],
# i.e. the same order as the [positive, negative] chart labels below.
# `values`, `positive` and `negative` are defined outside this section.
values['Train_bal'] = [
    len(trnY[trnY != negative]),
    len(trnY[trnY != positive]),
]
values['Test_bal'] = [
    len(tstY[tstY != negative]),
    len(tstY[tstY != positive]),
]

plt.figure(figsize=(12, 4))
ds.multiple_bar_chart(
    [positive, negative], values, title='Data distribution per dataset'
)
# plt.show() is commented out, so the figure is not displayed at this point.

# Reload both partitions from the files that were just written.
train_bal: DataFrame = read_csv(f'dados/{file}_train_bal.csv')
test_bal: DataFrame = read_csv(f'dados/{file}_test_bal.csv')

#### create copies
# One independent deep copy of the training frame per balancing strategy
# (1 = undersample, 2 = oversample, 3 = SMOTE), so that in-place operations
# such as pop() on one copy cannot affect the others.
train_bal_1, train_bal_2, train_bal_3 = (
    copy.deepcopy(train_bal) for _ in range(3)
)

# A single test frame is shared by all strategies; the per-strategy test
# copies (test_bal_2, test_bal_3) are not created.
test_bal = copy.deepcopy(test_bal)

from matplotlib.pyplot import figure, savefig, show, bar,xlabel, ylabel,title
from ds_charts import bar_chart

#####balancing

# Inspect the class distribution of the training set.
print(train_bal_1.columns)
target_count = train_bal_1[target].value_counts()

# The least frequent label is treated as the positive (minority) class and
# the most frequent one as the negative (majority) class.
positive_class = target_count.idxmin()
negative_class = target_count.idxmax()
ind_positive_class = target_count.index.get_loc(positive_class)

n_minority = target_count[positive_class]
n_majority = target_count[negative_class]
print('Minority class=', positive_class, ':', n_minority)
print('Majority class=', negative_class, ':', n_majority)
print('Proportion:', round(n_minority / n_majority, 2), ': 1')

# `values` is rebound here to a new dict holding only the original counts,
# ordered [minority, majority]; the balancing steps below add their own keys.
values = {'Original': [n_minority, n_majority]}

figure()  # original author's note: everything is badly aligned
bar_chart(target_count.index, target_count.values, title='Class balance')
savefig(f'dados/{file}_balance.png')
# show() is commented out, so the chart is only written to disk.


##undersample

# Undersampling: keep every minority-class row and draw an equally sized
# random subset of the majority-class rows.
df_positives = train_bal_1[train_bal_1[target] == positive_class]
df_negatives = train_bal_1[train_bal_1[target] == negative_class]

from pandas import concat, DataFrame

# Seeded draw (same seed as the train/test split) so the saved subset and the
# evaluation below are reproducible from run to run.
df_neg_sample = DataFrame(df_negatives.sample(len(df_positives), random_state=0))
df_under = concat([df_positives, df_neg_sample], axis=0)
# Written under dados/ like every other file this script produces; the
# original path was missing the directory prefix.
df_under.to_csv(f'dados/{file}_undersample.csv', index=False)
values['UnderSample'] = [len(df_positives), len(df_neg_sample)]
print('Minority class=', positive_class, ':', len(df_positives))
print('Majority class=', negative_class, ':', len(df_neg_sample))
print('Proportion:', round(len(df_positives) / len(df_neg_sample), 2), ': 1')

#### Naive Bayes trained on the undersampled data
# pop() removes the class column in place, leaving only features in df_under.
trnY: ndarray = df_under.pop(target).values
trnX: ndarray = df_under.values

labels = unique(trnY)
labels.sort()

# The class column is popped from the shared test frame here; the resulting
# tstX / tstY arrays are the ones the later evaluation sections reuse.
tstY: ndarray = test_bal.pop(target).values
tstX: ndarray = test_bal.values

clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_under.png')
show()

###Replication aka oversample

# Replication (oversampling): keep every majority-class row and resample the
# minority-class rows with replacement until both classes have equal size.
df_positives = train_bal_2[train_bal_2[target] == positive_class]
df_negatives = train_bal_2[train_bal_2[target] == negative_class]

from pandas import concat, DataFrame

# Seeded draw (same seed as the train/test split) so the replicated set and
# the evaluation below are reproducible from run to run.
df_pos_sample = DataFrame(
    df_positives.sample(len(df_negatives), replace=True, random_state=0)
)
df_over = concat([df_pos_sample, df_negatives], axis=0)
df_over.to_csv(f'dados/{file}_oversample.csv', index=False)
values['OverSample'] = [len(df_pos_sample), len(df_negatives)]
print('Minority class=', positive_class, ':', len(df_pos_sample))
print('Majority class=', negative_class, ':', len(df_negatives))
print('Proportion:', round(len(df_pos_sample) / len(df_negatives), 2), ': 1')

### Naive Bayes trained on the replicated (oversampled) data
# pop() removes the class column in place, leaving only features in df_over.
trnY_rep: ndarray = df_over.pop(target).values
trnX_rep: ndarray = df_over.values

labels = unique(trnY_rep)
labels.sort()

# tstX / tstY are reused from the undersampling section: the class column has
# already been popped from the shared test frame there.

# Fit and evaluate on the oversampled arrays. The original code used
# trnX / trnY here, which still held the undersampled data, so this step
# re-evaluated the undersample model and saved it under the "_over" name.
clf = GaussianNB()
clf.fit(trnX_rep, trnY_rep)
prd_trn = clf.predict(trnX_rep)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY_rep, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_over.png')
show()


#Smote
from pandas import Series
from imblearn.over_sampling import SMOTE
# Seed passed to SMOTE below.
RANDOM_STATE = 42

# Split the third training copy into features and labels; pop() removes the
# class column from train_bal_3 in place.
y = train_bal_3.pop(target).values
X = train_bal_3.values

# Resample with SMOTE using the 'minority' sampling strategy.
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
smote_X, smote_y = smote.fit_resample(X, y)

# Reassemble a labelled frame (feature columns first, class column last),
# restore the column names and persist it.
smote_features = DataFrame(smote_X)
smote_labels = DataFrame(smote_y)
df_smote = concat([smote_features, smote_labels], axis=1)
df_smote.columns = list(original.columns) + ['target']
df_smote.to_csv(f'dados/{file}_smote.csv', index=False)

# Class counts after resampling, recorded as [minority, majority].
smote_target_count = Series(smote_y).value_counts()
smote_n_minority = smote_target_count[positive_class]
smote_n_majority = smote_target_count[negative_class]
values['SMOTE'] = [smote_n_minority, smote_n_majority]
print('Minority class=', positive_class, ':', smote_n_minority)
print('Majority class=', negative_class, ':', smote_n_majority)
print('Proportion:', round(smote_n_minority / smote_n_majority, 2), ': 1')

## Naive Bayes trained on the SMOTE-resampled data
trnY: ndarray = df_smote.pop(target).values
trnX: ndarray = df_smote.values

labels = unique(trnY)
labels.sort()

# tstX / tstY are not rebuilt here; the arrays already in scope are used.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_smote.png')
show()