from copy import deepcopy
import numpy as np
from pandas import read_csv, concat, unique, DataFrame
import matplotlib.pyplot as plt
import ds_charts as ds
from sklearn.model_selection import train_test_split
import pandas
# --- Load and clean the raw monthly dataset --------------------------------
file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'

# Read the raw CSV ('na' marks missing values), drop columns that are entirely
# missing, and remove the station identifier (an id, not a feature).
data = (
    read_csv(filename, na_values='na')
    .dropna(axis=1, how='all')
    .drop(columns='station')
)

# FRGT arrives as text; coerce to numeric (unparseable entries become NaN).
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')
print(data.shape)
print(data['FRGT'])

# Class coding and the running record of class counts per derived dataset,
# used by the distribution bar charts further down.
target = 'target'
positive = 1
negative = 0
values = {'Original': [int((data[target] == positive).sum()),
                       int((data[target] == negative).sum())]}
##MV Imputation - drop cols + mean
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

# Missing-value count per variable, keeping only variables that have any.
na_counts = data.isna().sum()
mv = {col: cnt for col, cnt in na_counts.items() if cnt > 0}

##Drop columns with more than 90% MVs
threshold = data.shape[0] * 0.90
missings = [c for c in mv if mv[c] > threshold]
df = data.drop(columns=missings, inplace=False)
df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)
print('Dropped variables', missings)
print(df.shape)
# BUG FIX: the CSV above was written with index=False, so reading it back with
# index_col=0 silently promoted the first feature column to the index and
# dropped it from the data.  Read it with the default RangeIndex instead.
data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv')
#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import nan

# Placeholders for numeric / symbolic / boolean frames.  Only the numeric one
# is filled below; concat silently drops the None entries.
# NOTE(review): any non-numeric columns in data_drop are discarded here —
# confirm that is intended.
tmp_nr, tmp_sb, tmp_bool = None, None, None
variables = get_variable_types(data_drop)
numeric_vars = variables['Numeric']
if len(numeric_vars) > 0:
    # Mean-impute every numeric variable (NaN -> column mean).
    imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
    tmp_nr = DataFrame(imp.fit_transform(data_drop[numeric_vars]), columns=numeric_vars)
df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
df.index = data_drop.index
# Restore the original target column so labels stay exact (not imputed floats).
df[["target"]] = data_drop[["target"]]
df.to_csv(f'dados/{file}_data_drop_mean.csv', index=False)
print(df.describe(include='all'))
# BUG FIX: the imputed CSV was saved with index=False, so index_col=0 would
# have consumed the first feature column as the index; read it back plainly.
data_drop_mean: DataFrame = read_csv(f'dados/{file}_data_drop_mean.csv')
##Train and test split
target = 'target'
y: np.ndarray = data_drop_mean.pop(target).values
X: np.ndarray = data_drop_mean.values
labels: np.ndarray = unique(y)
labels.sort()
# Stratified 70/30 split keeps the class proportions in both partitions.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
train_drop_mean = concat([DataFrame(trnX, columns=data_drop_mean.columns), DataFrame(trnY, columns=[target])], axis=1)
train_drop_mean.to_csv(f'dados/{file}_train_drop_mean.csv', index=False)
test_drop_mean = concat([DataFrame(tstX, columns=data_drop_mean.columns), DataFrame(tstY, columns=[target])], axis=1)
test_drop_mean.to_csv(f'dados/{file}_test_drop_mean.csv', index=False)
# Per-partition class counts for the distribution chart (direct boolean sums
# replace the previous np.delete/np.argwhere round-trip).
values['Train_drop_mean'] = [int((trnY != negative).sum()), int((trnY != positive).sum())]
values['Test_drop_mean'] = [int((tstY != negative).sum()), int((tstY != positive).sum())]
plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
# Reload from disk so the NB section below works from the persisted files.
train_drop_mean: DataFrame = read_csv(f'dados/{file}_train_drop_mean.csv')
test_drop_mean: DataFrame = read_csv(f'dados/{file}_test_drop_mean.csv')
#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results
# Separate features and labels for both partitions (pop removes the target
# column in place from the frames).
trnY: ndarray = train_drop_mean.pop(target).values
trnX: ndarray = train_drop_mean.values
labels = unique(trnY)
labels.sort()
print(test_drop_mean.columns)
tstY: ndarray = test_drop_mean.pop(target).values
tstX: ndarray = test_drop_mean.values
# Fit Gaussian Naive Bayes and evaluate on train and held-out test data.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels,trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_drop_mean.png')
show()
##MV Imputation - drop rows + mean
##Drop rows with more than 60% MVs
from pandas.plotting import register_matplotlib_converters
from math import ceil
register_matplotlib_converters()
# Keep rows with at least 40% non-missing values (i.e. drop rows missing more
# than 60%).  FIX: dropna's `thresh` must be an integer in current pandas, so
# round the 40% cut-off up to the next whole count — for integer counts,
# count >= 0.40*ncols is equivalent to count >= ceil(0.40*ncols).
threshold = ceil(data.shape[1] * 0.40)
df = data.dropna(thresh=threshold, inplace=False)
df.to_csv(f'dados/{file}_drop_rows_mv.csv', index=True)
print(df.shape)
print(df.columns)
# Saved with index=True, so index_col=0 correctly restores the row labels here.
data_drop_r: DataFrame = read_csv(f'dados/{file}_drop_rows_mv.csv', index_col=0)
#fill empty columns with 0
# Columns that are entirely NaN in the surviving rows get a constant 0 so the
# mean imputer below does not fail on them.
nan_cols = data_drop_r.columns[data_drop_r.isna().all()].tolist()
data_drop_r[nan_cols] = data_drop_r[nan_cols].fillna(0)
#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import nan

# Mean-impute every numeric variable; the symbolic and boolean placeholders
# stay None and are silently dropped by the concat below.
tmp_nr, tmp_sb, tmp_bool = None, None, None
variables = get_variable_types(data_drop_r)
numeric_vars = variables['Numeric']
if numeric_vars:
    mean_imputer = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
    imputed = mean_imputer.fit_transform(data_drop_r[numeric_vars])
    tmp_nr = DataFrame(imputed, columns=numeric_vars)
df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
df.index = data_drop_r.index
# Keep the original (un-imputed) target labels.
df[["target"]] = data_drop_r[["target"]]
df.to_csv(f'dados/{file}_data_drop_mean_r.csv', index=False)
print(df.describe(include='all'))
# BUG FIX: this CSV was written with index=False, so index_col=0 would have
# turned the first feature column into the index and dropped it from the data.
data_drop_mean_r: DataFrame = read_csv(f'dados/{file}_data_drop_mean_r.csv')
##Train and test split
target = 'target'
y: np.ndarray = data_drop_mean_r.pop(target).values
X: np.ndarray = data_drop_mean_r.values
labels: np.ndarray = unique(y)
labels.sort()
# Stratified 70/30 split preserves class proportions.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
train_drop_mean_r = concat([DataFrame(trnX, columns=data_drop_mean_r.columns), DataFrame(trnY, columns=[target])], axis=1)
train_drop_mean_r.to_csv(f'dados/{file}_train_drop_mean_r.csv', index=False)
test_drop_mean_r = concat([DataFrame(tstX, columns=data_drop_mean_r.columns), DataFrame(tstY, columns=[target])], axis=1)
test_drop_mean_r.to_csv(f'dados/{file}_test_drop_mean_r.csv', index=False)
# Per-partition class counts for the distribution chart.
values['Train_drop_mean_r'] = [int((trnY != negative).sum()), int((trnY != positive).sum())]
values['Test_drop_mean_r'] = [int((tstY != negative).sum()), int((tstY != positive).sum())]
plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
train_drop_mean_r: DataFrame = read_csv(f'dados/{file}_train_drop_mean_r.csv')
test_drop_mean_r: DataFrame = read_csv(f'dados/{file}_test_drop_mean_r.csv')
#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results
# Separate features and labels for the drop-rows + mean-imputed partitions.
trnY: ndarray = train_drop_mean_r.pop(target).values
trnX: ndarray = train_drop_mean_r.values
labels = unique(trnY)
labels.sort()
print(test_drop_mean_r.columns)
tstY: ndarray = test_drop_mean_r.pop(target).values
tstX: ndarray = test_drop_mean_r.values
# Fit Gaussian Naive Bayes and evaluate on train and held-out test data.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_drop_mean_r.png')
show()
##Scaling --------------------------------------------------------------
data_scaling = read_csv(f'dados/{file}_data_drop_mean_r.csv')
variable_types = get_variable_types(data_scaling)
numeric_vars = variable_types['Numeric']
symbolic_vars = variable_types['Symbolic']
boolean_vars = variable_types['Binary']
df_nr = data_scaling[numeric_vars]
df_sb = data_scaling[symbolic_vars]
df_bool = data_scaling[boolean_vars]
#Z-score
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame, concat
transf = StandardScaler(with_mean=True, with_std=True, copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=data_scaling.index, columns=numeric_vars)
norm_data_zscore = concat([tmp, df_sb, df_bool], axis=1)
# BUG FIX: the class label must not be z-scored — scaling it destroys the 0/1
# coding that the stratified split and class-count code below rely on.
# Overwrite whatever version landed in the concat with the original column.
norm_data_zscore[target] = data_scaling[target]
norm_data_zscore.to_csv(f'dados/{file}_scaled_zscore.csv', index=False)
###Train Test Split
target = 'target'
# Split features/labels, then persist the stratified 70/30 partitions.
y: np.ndarray = norm_data_zscore.pop(target).values
X: np.ndarray = norm_data_zscore.values
labels: np.ndarray = unique(y)
labels.sort()
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
for split_name, split_X, split_Y in (('train', trnX, trnY), ('test', tstX, tstY)):
    part = concat([DataFrame(split_X, columns=norm_data_zscore.columns), DataFrame(split_Y, columns=[target])], axis=1)
    part.to_csv(f'dados/{file}_{split_name}_zscore.csv', index=False)
# Class counts per partition: everything not negative / not positive, exactly
# as the original np.delete(np.argwhere(...)) formulation computed.
values['Train_zscore'] = [int((trnY != negative).sum()), int((trnY != positive).sum())]
values['Test_zscore'] = [int((tstY != negative).sum()), int((tstY != positive).sum())]
plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
train_zscore: DataFrame = read_csv(f'dados/{file}_train_zscore.csv')
test_zscore: DataFrame = read_csv(f'dados/{file}_test_zscore.csv')
####NB - zscore
# Separate features and labels for the z-score-scaled partitions.
trnY: ndarray = train_zscore.pop(target).values
trnX: ndarray = train_zscore.values
labels = unique(trnY)
labels.sort()
print(test_zscore.columns)
tstY: ndarray = test_zscore.pop(target).values
tstX: ndarray = test_zscore.values
# Fit Gaussian Naive Bayes and evaluate on train and held-out test data.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_zscore.png')
show()
#MinMax
from sklearn.preprocessing import MinMaxScaler
from pandas import DataFrame, concat
transf = MinMaxScaler(feature_range=(0, 1), copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=data_scaling.index, columns=numeric_vars)
norm_data_minmax = concat([tmp, df_sb, df_bool], axis=1)
# FIX: keep the class label on its original 0/1 coding rather than the scaled
# version (min-max happens to preserve 0/1 only when both classes are present;
# restoring the source column makes that explicit and safe).
norm_data_minmax[target] = data_scaling[target]
norm_data_minmax.to_csv(f'dados/{file}_scaled_minmax.csv', index=False)
print(norm_data_minmax.describe())
###Train Test Split
target = 'target'
# Split features/labels, then persist the stratified 70/30 partitions.
y: np.ndarray = norm_data_minmax.pop(target).values
X: np.ndarray = norm_data_minmax.values
labels: np.ndarray = unique(y)
labels.sort()
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
for split_name, split_X, split_Y in (('train', trnX, trnY), ('test', tstX, tstY)):
    part = concat([DataFrame(split_X, columns=norm_data_minmax.columns), DataFrame(split_Y, columns=[target])], axis=1)
    part.to_csv(f'dados/{file}_{split_name}_minmax.csv', index=False)
# Class counts per partition (same semantics as the np.delete formulation).
values['Train_minmax'] = [int((trnY != negative).sum()), int((trnY != positive).sum())]
values['Test_minmax'] = [int((tstY != negative).sum()), int((tstY != positive).sum())]
plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
train_minmax: DataFrame = read_csv(f'dados/{file}_train_minmax.csv')
test_minmax: DataFrame = read_csv(f'dados/{file}_test_minmax.csv')
####NB - minmax
# Separate features and labels for the min-max-scaled partitions.
trnY: ndarray = train_minmax.pop(target).values
trnX: ndarray = train_minmax.values
labels = unique(trnY)
labels.sort()
print(test_minmax.columns)
tstY: ndarray = test_minmax.pop(target).values
tstX: ndarray = test_minmax.values
# Fit Gaussian Naive Bayes and evaluate on train and held-out test data.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_minmax.png')
show()
##Resultados- Scaling
from matplotlib.pyplot import figure, subplots, show
# Side-by-side boxplots comparing original, z-scored and min-max data.
# FIX: removed the stray figure() call that preceded subplots() — it opened
# an empty extra figure window that was never drawn into.
fig, axs = subplots(1, 3, figsize=(20, 10), squeeze=False)
axs[0, 0].set_title('Original data')
data.boxplot(ax=axs[0, 0])
axs[0, 1].set_title('Z-score normalization')
norm_data_zscore.boxplot(ax=axs[0, 1])
axs[0, 2].set_title('MinMax normalization')
norm_data_minmax.boxplot(ax=axs[0, 2])
savefig(f'dados/{file}_scaling.png')
show()
#Balancing----------------------------------------------------------------------
# Balancing experiments start from the min-max scaled dataset (change the
# filename here if a different scaling was preferred).
original = read_csv(f'dados/{file}_scaled_minmax.csv')
target = 'target'
###train e test split
y: np.ndarray = original.pop(target).values
X: np.ndarray = original.values
labels: np.ndarray = unique(y)
labels.sort()
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)
for split_name, split_X, split_Y in (('train', trnX, trnY), ('test', tstX, tstY)):
    part = concat([DataFrame(split_X, columns=original.columns), DataFrame(split_Y, columns=[target])], axis=1)
    part.to_csv(f'dados/{file}_{split_name}_bal.csv', index=False)
# Class counts per partition (same semantics as the np.delete formulation).
values['Train_bal'] = [int((trnY != negative).sum()), int((trnY != positive).sum())]
values['Test_bal'] = [int((tstY != negative).sum()), int((tstY != positive).sum())]
plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()
train_bal: DataFrame = read_csv(f'dados/{file}_train_bal.csv')
test_bal: DataFrame = read_csv(f'dados/{file}_test_bal.csv')
#### create copies (one train/test pair per balancing strategy)
# BUG FIX: pandas DataFrames have no .deepcopy() method — the original calls
# raised AttributeError.  Use copy(deep=True) for independent deep copies.
train_bal_1 = train_bal.copy(deep=True)
test_bal_1 = test_bal.copy(deep=True)
train_bal_2 = train_bal.copy(deep=True)
test_bal_2 = test_bal.copy(deep=True)
train_bal_3 = train_bal.copy(deep=True)
test_bal_3 = test_bal.copy(deep=True)
from matplotlib.pyplot import figure, savefig, show, bar, xlabel, ylabel, title
from ds_charts import bar_chart
#####balancing
print(train_bal_1.columns)
# Identify minority (positive) and majority (negative) classes from the
# training-set label counts.
target_count = train_bal_1[target].value_counts()
positive_class = target_count.idxmin()
negative_class = target_count.idxmax()
ind_positive_class = target_count.index.get_loc(positive_class)
print('Minority class=', positive_class, ':', target_count[positive_class])
print('Majority class=', negative_class, ':', target_count[negative_class])
print('Proportion:', round(target_count[positive_class] / target_count[negative_class], 2), ': 1')
# NOTE: this rebinds `values`, discarding the earlier per-dataset counts —
# from here on the dict only tracks the balancing experiments.
values = {'Original': [target_count[positive_class], target_count[negative_class]]}
figure()  # NOTE(review): chart layout looks misaligned (translated from the original Portuguese note)
bar_chart(target_count.index, target_count.values, title='Class balance')
savefig(f'dados/{file}_balance.png')
show()
##undersample
# Undersampling: keep every minority-class row and draw an equally-sized
# random subset of the majority class.
df_positives = train_bal_1[train_bal_1[target] == positive_class]
df_negatives = train_bal_1[train_bal_1[target] == negative_class]
from pandas import concat, DataFrame
df_neg_sample = DataFrame(df_negatives.sample(len(df_positives)))
df_under = concat([df_positives, df_neg_sample], axis=0)
# FIX: write into dados/ like every other artefact in this script (the path
# was missing its directory prefix).
df_under.to_csv(f'dados/{file}_undersample.csv', index=False)
values['UnderSample'] = [len(df_positives), len(df_neg_sample)]
print('Minority class=', positive_class, ':', len(df_positives))
print('Majority class=', negative_class, ':', len(df_neg_sample))
print('Proportion:', round(len(df_positives) / len(df_neg_sample), 2), ': 1')
####NB - undersample
# BUG FIX (two defects): the original fitted on the unbalanced train_bal_1,
# so the undersampled set df_under built above was never used; and it passed
# the training predictions in the *test* slots of plot_evaluation_results.
# Train on df_under and evaluate on the held-out test partition.
trnY: ndarray = df_under.pop(target).values
trnX: ndarray = df_under.values
labels = unique(trnY)
labels.sort()
print(test_bal_1.columns)
tstY: ndarray = test_bal_1.pop(target).values
tstX: ndarray = test_bal_1.values
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_under.png')
show()
###Replication aka oversample
# NOTE(review): these two assignments copy the target column back from the
# source frames; the copies should already hold identical values, so this
# looks redundant — confirm nothing upstream was meant to strip the target.
train_bal_2[["target"]] = train_bal[["target"]]
test_bal_2[["target"]] = test_bal[["target"]]
# Oversampling by replication: sample the minority class with replacement
# until it matches the majority-class size.
df_positives = train_bal_2[train_bal_2[target] == positive_class]
df_negatives = train_bal_2[train_bal_2[target] == negative_class]
from pandas import concat, DataFrame
df_pos_sample = DataFrame(df_positives.sample(len(df_negatives), replace=True))
df_over = concat([df_pos_sample, df_negatives], axis=0)
df_over.to_csv(f'dados/{file}_oversample.csv', index=False)
values['OverSample'] = [len(df_pos_sample), len(df_negatives)]
print('Minority class=', positive_class, ':', len(df_pos_sample))
print('Majority class=', negative_class, ':', len(df_negatives))
print('Proportion:', round(len(df_pos_sample) / len(df_negatives), 2), ': 1')
###NB - replication
# BUG FIX (two defects): the original popped labels into trnY_rep/trnX_rep
# from train_bal_2 but then fitted and predicted with the leftover trnX/trnY
# from the undersample section, and it passed the training predictions in the
# *test* slots of plot_evaluation_results.  Train on the oversampled df_over
# and evaluate on the held-out test partition.
trnY_rep: ndarray = df_over.pop(target).values
trnX_rep: ndarray = df_over.values
labels = unique(trnY_rep)
labels.sort()
print(test_bal_2.columns)
tstY: ndarray = test_bal_2.pop(target).values
tstX: ndarray = test_bal_2.values
clf = GaussianNB()
clf.fit(trnX_rep, trnY_rep)
prd_trn = clf.predict(trnX_rep)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY_rep, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_over.png')
show()
#Smote
from pandas import Series
from imblearn.over_sampling import SMOTE
RANDOM_STATE = 42
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
# BUG FIX: `target` was already popped from `original` at the balancing
# train/test split above, so popping it again raised KeyError.  Reuse the
# X/y arrays produced there (they still hold the full scaled dataset).
smote_X, smote_y = smote.fit_resample(X, y)
df_smote = concat([DataFrame(smote_X), DataFrame(smote_y)], axis=1)
# `original` no longer contains the target column, so its columns are exactly
# the feature names.
df_smote.columns = list(original.columns) + ['target']
df_smote.to_csv(f'dados/{file}_smote.csv', index=False)
smote_target_count = Series(smote_y).value_counts()
values['SMOTE'] = [smote_target_count[positive_class], smote_target_count[negative_class]]
print('Minority class=', positive_class, ':', smote_target_count[positive_class])
print('Majority class=', negative_class, ':', smote_target_count[negative_class])
print('Proportion:', round(smote_target_count[positive_class] / smote_target_count[negative_class], 2), ': 1')