# Untitled
#
# avatar
# unknown
# plain_text
# 2 years ago
# 18 kB
# 3
# Indexable
from copy import deepcopy
import numpy as np
from pandas import read_csv, concat, unique, DataFrame
import matplotlib.pyplot as plt
import ds_charts as ds
from sklearn.model_selection import train_test_split
import pandas

# Base name used in every output artefact, and the raw input file.
file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'

# Load the raw table ('na' marks a missing value), then discard columns that
# are entirely empty and the 'station' column.
data = (
    read_csv(filename, na_values='na')
    .dropna(axis=1, how='all')
    .drop(columns='station')
)
# Values of FRGT that cannot be parsed as numbers become NaN.
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')
print(data.shape)
print(data.FRGT)

# Class column and its two labels.
target = 'target'
positive = 1
negative = 0

# Per-dataset [positive count, negative count], later fed to the bar charts.
values = {
    'Original': [
        int((data[target] == positive).sum()),
        int((data[target] == negative).sum()),
    ]
}

##MV Imputation - drop cols + mean
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

# Number of missing values per column, keeping only columns that have any.
na_per_column = data.isna().sum()
mv = {var: nr for var, nr in na_per_column.items() if nr > 0}

## Drop columns with more than 90% MVs
threshold = data.shape[0] * 0.90

missings = [var for var, nr in mv.items() if nr > threshold]
df = data.drop(columns=missings)
df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)

print('Dropped variables', missings)
print(df.shape)
# NOTE(review): the file above is written without an index, so index_col=0
# turns its first data column into the row index — confirm this is intended.
data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv', index_col=0)

#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import nan

# Mean-impute the numeric variables of the column-dropped dataset.
# tmp_sb / tmp_bool are placeholders that stay None here; concat() skips None
# entries (and raises if every entry is None, i.e. there are no numeric vars).
tmp_nr, tmp_sb, tmp_bool = None, None, None
variables = get_variable_types(data_drop)
# presumably a list of column names — verify against ds_charts.get_variable_types
numeric_vars = variables['Numeric']

if len(numeric_vars) > 0:
    imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
    tmp_nr = DataFrame(imp.fit_transform(data_drop[numeric_vars]), columns=numeric_vars)

df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
# Restore the source row labels so the target column aligns row by row.
df.index = data_drop.index
df[["target"]] = data_drop[["target"]]
df.to_csv(f'dados/{file}_data_drop_mean.csv', index=False)

print(df.describe(include='all'))
# The CSV is written without an index column, so it must be read back without
# index_col; index_col=0 would turn the first imputed feature into the row
# index and silently remove it from the feature matrix built afterwards.
data_drop_mean: DataFrame = read_csv(f'dados/{file}_data_drop_mean.csv')


## Train and test split
target = 'target'
# Detach the class column; what remains in data_drop_mean are the features.
y: np.ndarray = data_drop_mean.pop(target).values
X: np.ndarray = data_drop_mean.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 partition with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

feature_names = data_drop_mean.columns

# Rebuild each partition as a table with the class as the last column.
train_drop_mean = DataFrame(trnX, columns=feature_names)
train_drop_mean[target] = trnY
train_drop_mean.to_csv(f'dados/{file}_train_drop_mean.csv', index=False)

test_drop_mean = DataFrame(tstX, columns=feature_names)
test_drop_mean[target] = tstY
test_drop_mean.to_csv(f'dados/{file}_test_drop_mean.csv', index=False)

# First entry: samples whose label is not `negative`; second: not `positive`.
values['Train_drop_mean'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test_drop_mean'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Reload both partitions from disk for the classifier step.
train_drop_mean: DataFrame = read_csv(f'dados/{file}_train_drop_mean.csv')
test_drop_mean: DataFrame = read_csv(f'dados/{file}_test_drop_mean.csv')

#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results


# Separate class vector and feature matrix for the training partition.
trnY: ndarray = train_drop_mean.pop(target).values
trnX: ndarray = train_drop_mean.values

labels = unique(trnY)
labels.sort()

# Columns are printed before the class column is removed from the test table.
print(test_drop_mean.columns)
tstY: ndarray = test_drop_mean.pop(target).values
tstX: ndarray = test_drop_mean.values

# Gaussian Naive Bayes: fit on train, predict both partitions, plot and save.
clf = GaussianNB().fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_drop_mean.png')
show()


##MV Imputation - drop rows + mean

##Drop rows with more than 60% MVs
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

# dropna(thresh=...) keeps rows with at least that many non-missing values,
# i.e. rows filled in for at least 40% of the columns.
threshold = data.shape[1] * 0.40
df = data.dropna(thresh=threshold)

# The row index is written out (to_csv default) and restored when reading back.
df.to_csv(f'dados/{file}_drop_rows_mv.csv')
print(df.shape)
print(df.columns)
data_drop_r: DataFrame = read_csv(f'dados/{file}_drop_rows_mv.csv', index_col=0)

# Fill columns that are entirely missing with 0.
is_all_missing = data_drop_r.isna().all()
nan_cols = is_all_missing[is_all_missing].index.tolist()
data_drop_r[nan_cols] = data_drop_r[nan_cols].fillna(0)


#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import nan

# Mean-impute the numeric variables of the row-dropped dataset.
# tmp_sb / tmp_bool are placeholders that stay None here; concat() skips None
# entries (and raises if every entry is None, i.e. there are no numeric vars).
tmp_nr, tmp_sb, tmp_bool = None, None, None
variables = get_variable_types(data_drop_r)
# presumably a list of column names — verify against ds_charts.get_variable_types
numeric_vars = variables['Numeric']

if len(numeric_vars) > 0:
    imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
    tmp_nr = DataFrame(imp.fit_transform(data_drop_r[numeric_vars]), columns=numeric_vars)

df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
# Restore the source row labels so the target column aligns row by row.
df.index = data_drop_r.index
df[["target"]] = data_drop_r[["target"]]
df.to_csv(f'dados/{file}_data_drop_mean_r.csv', index=False)

print(df.describe(include='all'))
# The CSV is written without an index column, so it must be read back without
# index_col; index_col=0 would turn the first imputed feature into the row
# index and silently remove it from the feature matrix built afterwards.
# The scaling step reads this same file without index_col as well.
data_drop_mean_r: DataFrame = read_csv(f'dados/{file}_data_drop_mean_r.csv')

## Train and test split
target = 'target'
# Detach the class column; what remains in data_drop_mean_r are the features.
y: np.ndarray = data_drop_mean_r.pop(target).values
X: np.ndarray = data_drop_mean_r.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 partition with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

feature_names = data_drop_mean_r.columns

# Rebuild each partition as a table with the class as the last column.
train_drop_mean_r = DataFrame(trnX, columns=feature_names)
train_drop_mean_r[target] = trnY
train_drop_mean_r.to_csv(f'dados/{file}_train_drop_mean_r.csv', index=False)

test_drop_mean_r = DataFrame(tstX, columns=feature_names)
test_drop_mean_r[target] = tstY
test_drop_mean_r.to_csv(f'dados/{file}_test_drop_mean_r.csv', index=False)

# First entry: samples whose label is not `negative`; second: not `positive`.
values['Train_drop_mean_r'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test_drop_mean_r'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Reload both partitions from disk for the classifier step.
train_drop_mean_r: DataFrame = read_csv(f'dados/{file}_train_drop_mean_r.csv')
test_drop_mean_r: DataFrame = read_csv(f'dados/{file}_test_drop_mean_r.csv')


#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results


# Separate class vector and feature matrix for the training partition.
trnY: ndarray = train_drop_mean_r.pop(target).values
trnX: ndarray = train_drop_mean_r.values

labels = unique(trnY)
labels.sort()

# Columns are printed before the class column is removed from the test table.
print(test_drop_mean_r.columns)
tstY: ndarray = test_drop_mean_r.pop(target).values
tstX: ndarray = test_drop_mean_r.values

# Gaussian Naive Bayes: fit on train, predict both partitions, plot and save.
clf = GaussianNB().fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_drop_mean_r.png')
show()


##Scaling --------------------------------------------------------------

# Scaling starts from the row-dropped, mean-imputed dataset.
data_scaling = read_csv(f'dados/{file}_data_drop_mean_r.csv')

# Column names grouped by variable type, and the matching sub-tables.
variable_types = get_variable_types(data_scaling)
numeric_vars, symbolic_vars, boolean_vars = (
    variable_types[kind] for kind in ('Numeric', 'Symbolic', 'Binary')
)
df_nr, df_sb, df_bool = (
    data_scaling[cols] for cols in (numeric_vars, symbolic_vars, boolean_vars)
)

#Z-score
from sklearn.preprocessing import StandardScaler
from pandas import DataFrame, concat

# Standardise the numeric columns, then reattach the untouched
# symbolic and binary columns.
transf = StandardScaler(with_mean=True, with_std=True, copy=True)
transf.fit(df_nr)
scaled = transf.transform(df_nr)
tmp = DataFrame(scaled, index=data_scaling.index, columns=numeric_vars)
norm_data_zscore = concat([tmp, df_sb, df_bool], axis=1)
norm_data_zscore.to_csv(f'dados/{file}_scaled_zscore.csv', index=False)

### Train Test Split
target = 'target'
# Detach the class column; what remains in norm_data_zscore are the features.
y: np.ndarray = norm_data_zscore.pop(target).values
X: np.ndarray = norm_data_zscore.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 partition with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

feature_names = norm_data_zscore.columns

# Rebuild each partition as a table with the class as the last column.
train_zscore = DataFrame(trnX, columns=feature_names)
train_zscore[target] = trnY
train_zscore.to_csv(f'dados/{file}_train_zscore.csv', index=False)

test_zscore = DataFrame(tstX, columns=feature_names)
test_zscore[target] = tstY
test_zscore.to_csv(f'dados/{file}_test_zscore.csv', index=False)

# First entry: samples whose label is not `negative`; second: not `positive`.
values['Train_zscore'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test_zscore'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Reload both partitions from disk for the classifier step.
train_zscore: DataFrame = read_csv(f'dados/{file}_train_zscore.csv')
test_zscore: DataFrame = read_csv(f'dados/{file}_test_zscore.csv')

#### NB - zscore

# Separate class vector and feature matrix for the training partition.
trnY: ndarray = train_zscore.pop(target).values
trnX: ndarray = train_zscore.values

labels = unique(trnY)
labels.sort()

# Columns are printed before the class column is removed from the test table.
print(test_zscore.columns)
tstY: ndarray = test_zscore.pop(target).values
tstX: ndarray = test_zscore.values

# Gaussian Naive Bayes: fit on train, predict both partitions, plot and save.
clf = GaussianNB().fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_zscore.png')
show()

#MinMax
from sklearn.preprocessing import MinMaxScaler
from pandas import DataFrame, concat

# Rescale the numeric columns to the [0, 1] range, then reattach the
# untouched symbolic and binary columns.
transf = MinMaxScaler(feature_range=(0, 1), copy=True)
transf.fit(df_nr)
scaled = transf.transform(df_nr)
tmp = DataFrame(scaled, index=data_scaling.index, columns=numeric_vars)
norm_data_minmax = concat([tmp, df_sb, df_bool], axis=1)
norm_data_minmax.to_csv(f'dados/{file}_scaled_minmax.csv', index=False)
print(norm_data_minmax.describe())

### Train Test Split
target = 'target'
# Detach the class column; what remains in norm_data_minmax are the features.
y: np.ndarray = norm_data_minmax.pop(target).values
X: np.ndarray = norm_data_minmax.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 partition with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

feature_names = norm_data_minmax.columns

# Rebuild each partition as a table with the class as the last column.
train_minmax = DataFrame(trnX, columns=feature_names)
train_minmax[target] = trnY
train_minmax.to_csv(f'dados/{file}_train_minmax.csv', index=False)

test_minmax = DataFrame(tstX, columns=feature_names)
test_minmax[target] = tstY
test_minmax.to_csv(f'dados/{file}_test_minmax.csv', index=False)

# First entry: samples whose label is not `negative`; second: not `positive`.
values['Train_minmax'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test_minmax'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Reload both partitions from disk for the classifier step.
train_minmax: DataFrame = read_csv(f'dados/{file}_train_minmax.csv')
test_minmax: DataFrame = read_csv(f'dados/{file}_test_minmax.csv')

#### NB - minmax

# Separate class vector and feature matrix for the training partition.
trnY: ndarray = train_minmax.pop(target).values
trnX: ndarray = train_minmax.values

labels = unique(trnY)
labels.sort()

# Columns are printed before the class column is removed from the test table.
print(test_minmax.columns)
tstY: ndarray = test_minmax.pop(target).values
tstX: ndarray = test_minmax.values

# Gaussian Naive Bayes: fit on train, predict both partitions, plot and save.
clf = GaussianNB().fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_minmax.png')
show()


## Results - Scaling
from matplotlib.pyplot import figure, subplots, show

# Side-by-side boxplots: raw data next to the two scaled versions.
figure()
fig, axs = subplots(1, 3, figsize=(20,10), squeeze=False)
panels = (
    ('Original data', data),
    ('Z-score normalization', norm_data_zscore),
    ('MinMax normalization', norm_data_minmax),
)
for panel_ax, (panel_name, panel_frame) in zip(axs[0], panels):
    panel_ax.set_title(panel_name)
    panel_frame.boxplot(ax=panel_ax)
savefig(f'dados/{file}_scaling.png')
show()

#Balancing----------------------------------------------------------------------

# Balancing works on the MinMax-scaled data; point this at another file if a
# different scaling is chosen.
original = read_csv(f'dados/{file}_scaled_minmax.csv')
target = 'target'

### Train and test split

# Detach the class column; what remains in `original` are the features.
y: np.ndarray = original.pop(target).values
X: np.ndarray = original.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 partition with a fixed seed.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

feature_names = original.columns

# Rebuild each partition as a table with the class as the last column.
train_bal = DataFrame(trnX, columns=feature_names)
train_bal[target] = trnY
train_bal.to_csv(f'dados/{file}_train_bal.csv', index=False)

test_bal = DataFrame(tstX, columns=feature_names)
test_bal[target] = tstY
test_bal.to_csv(f'dados/{file}_test_bal.csv', index=False)

# First entry: samples whose label is not `negative`; second: not `positive`.
values['Train_bal'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test_bal'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Reload both partitions from disk for the balancing experiments.
train_bal: DataFrame = read_csv(f'dados/{file}_train_bal.csv')
test_bal: DataFrame = read_csv(f'dados/{file}_test_bal.csv')

#### Create independent copies
# One train/test pair per balancing experiment, so that popping the target
# column in one experiment does not affect the others.
# DataFrame has no .deepcopy() method (that call raises AttributeError);
# DataFrame.copy(deep=True) is the pandas way to get an independent copy.
train_bal_1 = train_bal.copy(deep=True)
test_bal_1 = test_bal.copy(deep=True)

train_bal_2 = train_bal.copy(deep=True)
test_bal_2 = test_bal.copy(deep=True)

train_bal_3 = train_bal.copy(deep=True)
test_bal_3 = test_bal.copy(deep=True)

from matplotlib.pyplot import figure, savefig, show, bar,xlabel, ylabel,title
from ds_charts import bar_chart

#####balancing

print(train_bal_1.columns)

# Class frequencies in the training partition; the rarer label is treated as
# the positive (minority) class and the more frequent one as the negative.
target_count = train_bal_1[target].value_counts()
positive_class = target_count.idxmin()
negative_class = target_count.idxmax()
ind_positive_class = target_count.index.get_loc(positive_class)

n_minority = target_count[positive_class]
n_majority = target_count[negative_class]
print('Minority class=', positive_class, ':', n_minority)
print('Majority class=', negative_class, ':', n_majority)
print('Proportion:', round(n_minority / n_majority, 2), ': 1')

# Start a fresh summary dict for the balancing comparison.
values = {'Original': [n_minority, n_majority]}

figure()  # everything is badly aligned
bar_chart(target_count.index, target_count.values, title='Class balance')
savefig(f'dados/{file}_balance.png')
show()


##undersample

# Undersampling: keep every minority-class row and draw, without replacement,
# an equally sized random sample of the majority class.
df_positives = train_bal_1[train_bal_1[target] == positive_class]
df_negatives = train_bal_1[train_bal_1[target] == negative_class]

from pandas import concat, DataFrame

df_neg_sample = DataFrame(df_negatives.sample(len(df_positives)))
df_under = concat([df_positives, df_neg_sample], axis=0)
# Written under dados/ like every other artefact of this script (the original
# path had no directory prefix, so the file landed in the working directory).
df_under.to_csv(f'dados/{file}_undersample.csv', index=False)
values['UnderSample'] = [len(df_positives), len(df_neg_sample)]
print('Minority class=', positive_class, ':', len(df_positives))
print('Majority class=', negative_class, ':', len(df_neg_sample))
print('Proportion:', round(len(df_positives) / len(df_neg_sample), 2), ': 1')

#### NB - undersample
# Train on the undersampled table (df_under), not on the unbalanced training
# copy; otherwise this experiment would just repeat the unbalanced baseline.
# df_under is read without mutation so it stays intact for later use.
trnY: ndarray = df_under[target].values
trnX: ndarray = df_under.drop(columns=[target]).values

labels = unique(trnY)
labels.sort()

# The test partition is left unbalanced; columns are printed before the class
# column is removed.
print(test_bal_1.columns)
tstY: ndarray = test_bal_1.pop(target).values
tstX: ndarray = test_bal_1.values

clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
# Evaluate on train AND test (the original passed the train pair twice, so the
# test predictions were never evaluated).
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_under.png')
show()

### Replication aka oversample
# Re-copy the class column from the untouched tables into this experiment's
# copies.
train_bal_2[["target"]] = train_bal[["target"]]
test_bal_2[["target"]] = test_bal[["target"]]

is_minority = train_bal_2[target] == positive_class
is_majority = train_bal_2[target] == negative_class
df_positives = train_bal_2[is_minority]
df_negatives = train_bal_2[is_majority]

from pandas import concat, DataFrame

# Draw minority rows with replacement until they match the majority count.
df_pos_sample = df_positives.sample(n=len(df_negatives), replace=True)
df_over = concat([df_pos_sample, df_negatives], axis=0)
df_over.to_csv(f'dados/{file}_oversample.csv', index=False)

n_pos_over, n_neg_over = len(df_pos_sample), len(df_negatives)
values['OverSample'] = [n_pos_over, n_neg_over]
print('Minority class=', positive_class, ':', n_pos_over)
print('Majority class=', negative_class, ':', n_neg_over)
print('Proportion:', round(n_pos_over / n_neg_over, 2), ': 1')

### NB - replication
# Train on the oversampled table (df_over). The original computed
# trnX_rep / trnY_rep from the unbalanced copy and then never used them,
# fitting on whatever trnX / trnY an earlier step had left behind.
trnY_rep: ndarray = df_over[target].values
trnX_rep: ndarray = df_over.drop(columns=[target]).values

labels = unique(trnY_rep)
labels.sort()

# The test partition is left unbalanced; columns are printed before the class
# column is removed.
print(test_bal_2.columns)
tstY: ndarray = test_bal_2.pop(target).values
tstX: ndarray = test_bal_2.values

clf = GaussianNB()
clf.fit(trnX_rep, trnY_rep)
prd_trn = clf.predict(trnX_rep)
prd_tst = clf.predict(tstX)
# Evaluate on train AND test (the original passed the train pair twice, so the
# test predictions were never evaluated).
plot_evaluation_results(labels, trnY_rep, prd_trn, tstY, prd_tst)
savefig(f'dados/{file}_nb_best_over.png')
show()


#Smote
from pandas import Series
from imblearn.over_sampling import SMOTE
RANDOM_STATE = 42

# SMOTE oversampling of the minority class.
# The resampler is fed the training partition (train_bal), like the under- and
# oversampling experiments. The original popped `target` from `original`, but
# that column had already been popped for the train/test split, which raises
# KeyError; it also would have resampled test rows together with train rows.
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
smote_features = [col for col in train_bal.columns if col != target]
# Read without mutating train_bal so other steps can still use it.
y = train_bal[target].values
X = train_bal[smote_features].values
smote_X, smote_y = smote.fit_resample(X, y)
df_smote = concat([DataFrame(smote_X), DataFrame(smote_y)], axis=1)
df_smote.columns = smote_features + ['target']
df_smote.to_csv(f'dados/{file}_smote.csv', index=False)

# Class counts after resampling, reported like the other balancing steps.
smote_target_count = Series(smote_y).value_counts()
values['SMOTE'] = [smote_target_count[positive_class], smote_target_count[negative_class]]
print('Minority class=', positive_class, ':', smote_target_count[positive_class])
print('Majority class=', negative_class, ':', smote_target_count[negative_class])
print('Proportion:', round(smote_target_count[positive_class] / smote_target_count[negative_class], 2), ': 1')