Untitled
unknown
plain_text
2 years ago
8.4 kB
6
Indexable
"""Compare two missing-value strategies on the monthly dataset using Gaussian NB.

Strategy 1 drops columns whose missing-value count exceeds 90% of the rows.
Strategy 2 drops rows that have fewer than 40% non-missing values.
For each strategy the script splits the data 70/30 (stratified on the target),
fills the remaining numeric gaps with the per-column mean, trains a
``GaussianNB`` classifier and saves the evaluation figure.

All outputs (CSV files and figures) are written under ``imagens/``.
"""
import os
from math import ceil

import matplotlib.pyplot as plt
import numpy as np
import pandas
from matplotlib.pyplot import savefig, show
from numpy import nan, ndarray
from pandas import DataFrame, concat, read_csv, unique
from pandas.plotting import register_matplotlib_converters
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

import ds_charts as ds
from ds_charts import get_variable_types, plot_evaluation_results

register_matplotlib_converters()

file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'
target = 'target'
positive = 1
negative = 0


def class_counts(y) -> list:
    """Return ``[n_positive, n_negative]`` for the label values in *y*."""
    labels_arr = np.asarray(y)
    return [int((labels_arr == positive).sum()), int((labels_arr == negative).sum())]


def split_and_save(df: DataFrame, suffix: str) -> tuple:
    """Split *df* 70/30 (stratified on the target) and save both parts as CSV.

    The frame itself is split (instead of its ``.values``) so column names and
    dtypes survive the split. The target is moved to the last column, matching
    the layout of the ``*_train_drop*.csv`` / ``*_test_drop*.csv`` files.

    Returns the ``(train, test)`` frames with a fresh 0..n-1 index.
    """
    ordered = df[[c for c in df.columns if c != target] + [target]]
    train, test = train_test_split(ordered, train_size=0.7, stratify=ordered[target])
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)
    train.to_csv(f'imagens/{file}_train_drop{suffix}.csv', index=False)
    test.to_csv(f'imagens/{file}_test_drop{suffix}.csv', index=False)
    return train, test


def impute_numeric_mean(train: DataFrame, test: DataFrame) -> tuple:
    """Mean-impute the numeric variables of *train* and *test*.

    The imputer is fitted on the training data only and then applied to the
    test data, so no test-set statistics leak into the imputed values and both
    frames end up with exactly the same feature columns. Only the variables
    that ``get_variable_types`` reports as numeric are kept, plus the target.

    Raises:
        ValueError: if the training data has no numeric variable with at least
            one observed value.
    """
    # SimpleImputer(strategy='mean') discards columns that are entirely NaN
    # at fit time, which would break the column labelling below; skip them.
    numeric_vars = [
        c for c in get_variable_types(train)['Numeric']
        if c != target and train[c].notna().any()
    ]
    if not numeric_vars:
        raise ValueError('No numeric variables with observed values to impute')

    imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
    train_imp = DataFrame(
        imp.fit_transform(train[numeric_vars]), columns=numeric_vars, index=train.index
    )
    test_imp = DataFrame(
        imp.transform(test[numeric_vars]), columns=numeric_vars, index=test.index
    )
    train_imp[target] = train[target]
    test_imp[target] = test[target]
    return train_imp, test_imp


def evaluate_naive_bayes(train: DataFrame, test: DataFrame, figure_path: str) -> None:
    """Fit ``GaussianNB`` on *train*, evaluate on both sets and save the figure."""
    trnY: ndarray = train[target].values
    trnX: ndarray = train.drop(columns=target).values
    labels = np.sort(unique(trnY))
    print(test.columns)
    tstY: ndarray = test[target].values
    tstX: ndarray = test.drop(columns=target).values

    clf = GaussianNB()
    clf.fit(trnX, trnY)
    prd_trn = clf.predict(trnX)
    prd_tst = clf.predict(tstX)
    plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
    savefig(figure_path)
    show()


def run_experiment(df: DataFrame, suffix: str, values: dict) -> None:
    """Run split -> class-balance chart -> mean imputation -> NB for one strategy.

    *suffix* is appended to every output name (``''`` for the drop-columns
    strategy, ``'_r'`` for the drop-rows strategy). *values* accumulates the
    per-dataset class counts and is updated in place, so each chart also shows
    the datasets recorded by earlier calls.
    """
    train, test = split_and_save(df, suffix)

    values[f'Train_drop{suffix}'] = class_counts(train[target])
    values[f'Test_drop{suffix}'] = class_counts(test[target])
    plt.figure(figsize=(12, 4))
    ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
    plt.show()

    train_mean, test_mean = impute_numeric_mean(train, test)
    # Written without an index column, so these files must be read back
    # WITHOUT index_col=0; otherwise the first feature turns into the index.
    train_mean.to_csv(f'imagens/{file}_train_drop_mean{suffix}.csv', index=False)
    print(train_mean.describe(include='all'))
    test_mean.to_csv(f'imagens/{file}_test_drop_mean{suffix}.csv', index=False)
    print(test_mean.describe(include='all'))

    evaluate_naive_bayes(train_mean, test_mean, f'imagens/{file}_nb_best_drop_mean{suffix}.png')


# Make sure the output directory exists before anything is written to it.
os.makedirs('imagens', exist_ok=True)

data = read_csv(filename, na_values='na')
data = data.dropna(axis=1, how='all')
data = data.drop(columns='station')
# Non-numeric FRGT entries become NaN so they are treated as missing values.
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')
print(data.shape)
print(data.FRGT)

values = {'Original': class_counts(data[target])}

## MV Imputation - drop cols + mean
## Drop columns with more than 90% MVs
threshold = data.shape[0] * 0.90
missing_counts = data.isna().sum()
missings = [c for c, nr in missing_counts.items() if nr > threshold]
df = data.drop(columns=missings, inplace=False)
df.to_csv(f'imagens/{file}_drop_columns_mv.csv', index=False)
print('Dropped variables', missings)
print(df.shape)
run_experiment(df, '', values)

## MV Imputation - drop rows + mean
## Drop rows with more than 60% MVs (i.e. keep rows with >= 40% observed values)
# dropna expects an integer count; ceil() keeps the same cut-off as the
# fractional threshold because the comparison is "non-NA count >= thresh".
threshold = ceil(data.shape[1] * 0.40)
df = data.dropna(thresh=threshold, inplace=False)
df.to_csv(f'imagens/{file}_drop_rows_mv.csv', index=True)
print(df.shape)
print(df.columns)
run_experiment(df, '_r', values)
Editor is loading...