Untitled
unknown
plain_text
2 years ago
18 kB
4
Indexable
"""Preprocessing pipeline for the California monthly-weather dataset.

Steps: missing-value imputation (drop-columns+mean and drop-rows+mean),
scaling (z-score and min-max), and class balancing (undersample,
replication/oversample, SMOTE), each evaluated with a Gaussian Naive Bayes
baseline. All intermediate artifacts are written under ``dados/``.
"""
from copy import deepcopy

import numpy as np
import pandas
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure, savefig, show, subplots
from numpy import nan
from pandas import DataFrame, Series, concat, read_csv, unique
from pandas.plotting import register_matplotlib_converters
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE

import ds_charts as ds
from ds_charts import bar_chart, get_variable_types, plot_evaluation_results

file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'

data = read_csv(filename, na_values='na')
data = data.dropna(axis=1, how='all')       # drop columns that are entirely missing
data = data.drop(columns='station')         # identifier column, not a feature
# FRGT arrives as text in the raw file; coerce bad entries to NaN for imputation.
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')
print(data.shape)
print(data.FRGT)

target = 'target'
positive = 1
negative = 0
# Class counts per dataset variant, accumulated for the distribution charts.
values = {'Original': [len(data[data[target] == positive]),
                       len(data[data[target] == negative])]}

register_matplotlib_converters()


def _mean_impute(frame: DataFrame, out_csv: str) -> DataFrame:
    """Mean-impute the numeric columns of *frame*, reattach the target column,
    persist the result to *out_csv* and return it reloaded from disk
    (so downstream dtypes match the CSV round-trip)."""
    numeric_vars = get_variable_types(frame)['Numeric']
    parts = []
    if len(numeric_vars) > 0:
        imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
        parts.append(DataFrame(imp.fit_transform(frame[numeric_vars]),
                               columns=numeric_vars))
    out = concat(parts, axis=1)
    out.index = frame.index
    # The target must stay as the original labels, not a mean-imputed float.
    out[[target]] = frame[[target]]
    out.to_csv(out_csv, index=False)
    print(out.describe(include='all'))
    return read_csv(out_csv)


def _split_and_save(dataset: DataFrame, tag: str):
    """Stratified 70/30 train/test split of *dataset* (consumes its target
    column), save both parts under ``dados/``, record class counts in
    ``values`` and redraw the distribution chart. Returns the train and test
    frames reloaded from disk."""
    y: np.ndarray = dataset.pop(target).values
    X: np.ndarray = dataset.values
    trnX, tstX, trnY, tstY = train_test_split(
        X, y, train_size=0.7, stratify=y, random_state=0)
    train = concat([DataFrame(trnX, columns=dataset.columns),
                    DataFrame(trnY, columns=[target])], axis=1)
    train.to_csv(f'dados/{file}_train_{tag}.csv', index=False)
    test = concat([DataFrame(tstX, columns=dataset.columns),
                   DataFrame(tstY, columns=[target])], axis=1)
    test.to_csv(f'dados/{file}_test_{tag}.csv', index=False)
    values[f'Train_{tag}'] = [int(np.sum(trnY == positive)),
                              int(np.sum(trnY == negative))]
    values[f'Test_{tag}'] = [int(np.sum(tstY == positive)),
                             int(np.sum(tstY == negative))]
    plt.figure(figsize=(12, 4))
    ds.multiple_bar_chart([positive, negative], values,
                          title='Data distribution per dataset')
    plt.show()
    return (read_csv(f'dados/{file}_train_{tag}.csv'),
            read_csv(f'dados/{file}_test_{tag}.csv'))


def _nb_eval(train: DataFrame, test: DataFrame, out_png: str) -> None:
    """Fit a Gaussian Naive Bayes on *train*, evaluate on both *train* and
    *test* (their target columns are consumed) and save the evaluation plot."""
    trnY = train.pop(target).values
    trnX = train.values
    labels = unique(trnY)
    labels.sort()
    print(test.columns)
    tstY = test.pop(target).values
    tstX = test.values
    clf = GaussianNB()
    clf.fit(trnX, trnY)
    prd_trn = clf.predict(trnX)
    prd_tst = clf.predict(tstX)
    # BUG FIX: the original passed train predictions in the test slots for the
    # balancing runs, so test performance was never actually shown.
    plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
    savefig(out_png)
    show()


# ---------------------------------------------------------------------------
# MV Imputation - drop columns + mean
# ---------------------------------------------------------------------------
mv = {}
for var in data:
    nr = data[var].isna().sum()
    if nr > 0:
        mv[var] = nr

# Drop columns with more than 90% missing values.
threshold = data.shape[0] * 0.90
missings = [c for c in mv.keys() if mv[c] > threshold]
df = data.drop(columns=missings, inplace=False)
df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)
print('Dropped variables', missings)
print(df.shape)
# BUG FIX: file was written with index=False; reading it back with
# index_col=0 silently consumed the first feature column as the index.
data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv')

# Fill the remaining missing values with the column mean.
data_drop_mean = _mean_impute(data_drop, f'dados/{file}_data_drop_mean.csv')

train_drop_mean, test_drop_mean = _split_and_save(data_drop_mean, 'drop_mean')
_nb_eval(train_drop_mean, test_drop_mean, f'dados/{file}_nb_best_drop_mean.png')

# ---------------------------------------------------------------------------
# MV Imputation - drop rows + mean
# ---------------------------------------------------------------------------
# Keep rows with at least 40% non-missing values, i.e. drop rows with more
# than 60% missing values.
threshold = data.shape[1] * 0.40
df = data.dropna(thresh=threshold, inplace=False)
df.to_csv(f'dados/{file}_drop_rows_mv.csv', index=True)
print(df.shape)
print(df.columns)
data_drop_r: DataFrame = read_csv(f'dados/{file}_drop_rows_mv.csv', index_col=0)

# Columns left entirely empty by the row drop cannot take a mean; fill with 0.
nan_cols = data_drop_r.columns[data_drop_r.isna().all()].tolist()
data_drop_r[nan_cols] = data_drop_r[nan_cols].fillna(0)

data_drop_mean_r = _mean_impute(data_drop_r, f'dados/{file}_data_drop_mean_r.csv')

train_drop_mean_r, test_drop_mean_r = _split_and_save(data_drop_mean_r, 'drop_mean_r')
_nb_eval(train_drop_mean_r, test_drop_mean_r, f'dados/{file}_nb_best_drop_mean_r.png')

# ---------------------------------------------------------------------------
# Scaling
# ---------------------------------------------------------------------------
data_scaling = read_csv(f'dados/{file}_data_drop_mean_r.csv')
variable_types = get_variable_types(data_scaling)
numeric_vars = variable_types['Numeric']
symbolic_vars = variable_types['Symbolic']
boolean_vars = variable_types['Binary']
df_nr = data_scaling[numeric_vars]
df_sb = data_scaling[symbolic_vars]
df_bool = data_scaling[boolean_vars]

# Z-score normalization (only numeric columns are transformed).
transf = StandardScaler(with_mean=True, with_std=True, copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=data_scaling.index,
                columns=numeric_vars)
norm_data_zscore = concat([tmp, df_sb, df_bool], axis=1)
norm_data_zscore.to_csv(f'dados/{file}_scaled_zscore.csv', index=False)

train_zscore, test_zscore = _split_and_save(norm_data_zscore, 'zscore')
_nb_eval(train_zscore, test_zscore, f'dados/{file}_nb_best_zscore.png')

# MinMax normalization to [0, 1].
transf = MinMaxScaler(feature_range=(0, 1), copy=True).fit(df_nr)
tmp = DataFrame(transf.transform(df_nr), index=data_scaling.index,
                columns=numeric_vars)
norm_data_minmax = concat([tmp, df_sb, df_bool], axis=1)
norm_data_minmax.to_csv(f'dados/{file}_scaled_minmax.csv', index=False)
print(norm_data_minmax.describe())

train_minmax, test_minmax = _split_and_save(norm_data_minmax, 'minmax')
_nb_eval(train_minmax, test_minmax, f'dados/{file}_nb_best_minmax.png')

# Scaling comparison boxplots (note: the target columns were consumed by the
# splits above, so the normalized frames plot features only).
figure()
fig, axs = subplots(1, 3, figsize=(20, 10), squeeze=False)
axs[0, 0].set_title('Original data')
data.boxplot(ax=axs[0, 0])
axs[0, 1].set_title('Z-score normalization')
norm_data_zscore.boxplot(ax=axs[0, 1])
axs[0, 2].set_title('MinMax normalization')
norm_data_minmax.boxplot(ax=axs[0, 2])
savefig(f'dados/{file}_scaling.png')
show()

# ---------------------------------------------------------------------------
# Balancing (on the min-max scaled data; change the source CSV if another
# scaling performed better)
# ---------------------------------------------------------------------------
original = read_csv(f'dados/{file}_scaled_minmax.csv')
train_bal, test_bal = _split_and_save(original, 'bal')

# Independent working copies for the balancing strategies.
# BUG FIX: DataFrame has no .deepcopy() method — the original raised
# AttributeError here; use copy.deepcopy (already imported).
train_bal_1 = deepcopy(train_bal)
test_bal_1 = deepcopy(test_bal)
train_bal_2 = deepcopy(train_bal)
test_bal_2 = deepcopy(test_bal)

print(train_bal_1.columns)
target_count = train_bal_1[target].value_counts()
positive_class = target_count.idxmin()
negative_class = target_count.idxmax()
print('Minority class=', positive_class, ':', target_count[positive_class])
print('Majority class=', negative_class, ':', target_count[negative_class])
print('Proportion:',
      round(target_count[positive_class] / target_count[negative_class], 2),
      ': 1')

values = {'Original': [target_count[positive_class],
                       target_count[negative_class]]}
figure()
bar_chart(target_count.index, target_count.values, title='Class balance')
savefig(f'dados/{file}_balance.png')
show()

# Undersample: shrink the majority class to the minority-class size.
df_positives = train_bal_1[train_bal_1[target] == positive_class]
df_negatives = train_bal_1[train_bal_1[target] == negative_class]
df_neg_sample = DataFrame(df_negatives.sample(len(df_positives)))
df_under = concat([df_positives, df_neg_sample], axis=0)
# BUG FIX: path was missing the dados/ prefix used by every other artifact.
df_under.to_csv(f'dados/{file}_undersample.csv', index=False)
values['UnderSample'] = [len(df_positives), len(df_neg_sample)]
print('Minority class=', positive_class, ':', len(df_positives))
print('Majority class=', negative_class, ':', len(df_neg_sample))
print('Proportion:', round(len(df_positives) / len(df_neg_sample), 2), ': 1')

# NB - undersample. BUG FIX: the original fit on the unbalanced train set,
# not on df_under, so the undersampling had no effect on the model.
_nb_eval(df_under.reset_index(drop=True), test_bal_1,
         f'dados/{file}_nb_best_under.png')

# Replication (oversample): repeat minority rows up to the majority size.
df_positives = train_bal_2[train_bal_2[target] == positive_class]
df_negatives = train_bal_2[train_bal_2[target] == negative_class]
df_pos_sample = DataFrame(df_positives.sample(len(df_negatives), replace=True))
df_over = concat([df_pos_sample, df_negatives], axis=0)
df_over.to_csv(f'dados/{file}_oversample.csv', index=False)
values['OverSample'] = [len(df_pos_sample), len(df_negatives)]
print('Minority class=', positive_class, ':', len(df_pos_sample))
print('Majority class=', negative_class, ':', len(df_negatives))
print('Proportion:', round(len(df_pos_sample) / len(df_negatives), 2), ': 1')

# NB - replication. BUG FIX: the original built trnX_rep/trnY_rep but then
# fit on the stale trnX/trnY from the previous section.
_nb_eval(df_over.reset_index(drop=True), test_bal_2,
         f'dados/{file}_nb_best_over.png')

# SMOTE: synthesize minority samples instead of replicating them.
RANDOM_STATE = 42
smote = SMOTE(sampling_strategy='minority', random_state=RANDOM_STATE)
# BUG FIX: `original` already had its target popped by the split above, so a
# second pop raised KeyError; re-read the scaled data instead.
smote_src = read_csv(f'dados/{file}_scaled_minmax.csv')
y = smote_src.pop(target).values
X = smote_src.values
smote_X, smote_y = smote.fit_resample(X, y)
df_smote = concat([DataFrame(smote_X), DataFrame(smote_y)], axis=1)
df_smote.columns = list(smote_src.columns) + [target]
df_smote.to_csv(f'dados/{file}_smote.csv', index=False)

smote_target_count = Series(smote_y).value_counts()
values['SMOTE'] = [smote_target_count[positive_class],
                   smote_target_count[negative_class]]
print('Minority class=', positive_class, ':', smote_target_count[positive_class])
print('Majority class=', negative_class, ':', smote_target_count[negative_class])
print('Proportion:',
      round(smote_target_count[positive_class] / smote_target_count[negative_class], 2),
      ': 1')
Editor is loading...