Untitled
unknown
plain_text
2 years ago
3.6 kB
4
Indexable
## Train/test split -> missing-value handling -> Gaussian Naive Bayes
#
# Pipeline overview:
#   1. Load the raw CSV, drop all-empty columns and the 'station' column.
#   2. Stratified 70/30 split; persist both partitions and plot class balance.
#   3. Drop columns with more than 90% missing values (decided on TRAIN only)
#      and apply the very same drop to TEST.
#   4. Mean-impute the numeric features: the imputer is fitted on TRAIN and
#      only applied to TEST, so no test statistics leak into training.
#   5. Fit GaussianNB on the processed train set and evaluate on both sets.
#
# The target column is deliberately kept out of the imputation step: it is a
# label, not a feature, and it must survive regardless of how
# `get_variable_types` classifies a two-valued column.
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.pyplot import savefig, show
from numpy import nan, ndarray
from pandas import DataFrame, concat, read_csv, unique
from pandas.plotting import register_matplotlib_converters
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

import ds_charts as ds
from ds_charts import get_variable_types, plot_evaluation_results

register_matplotlib_converters()

file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'
target = 'target'
positive = 1
negative = 0

## Load and basic clean-up
data = read_csv(filename, na_values='na')
data = data.dropna(axis=1, how='all')
data = data.drop(columns='station')
print(data.shape)

# Class counts per partition, in [positive, negative] order for the bar chart.
values = {
    'Original': [
        len(data[data[target] == positive]),
        len(data[data[target] == negative]),
    ]
}

y: np.ndarray = data.pop(target).values
X: np.ndarray = data.values
labels: np.ndarray = unique(y)
labels.sort()

## Train and test split
# NOTE: no random_state is given, so the partition differs between runs.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y)

train = concat(
    [DataFrame(trnX, columns=data.columns), DataFrame(trnY, columns=[target])],
    axis=1,
)
train.to_csv(f'imagens/{file}_train.csv', index=False)

test = concat(
    [DataFrame(tstX, columns=data.columns), DataFrame(tstY, columns=[target])],
    axis=1,
)
test.to_csv(f'imagens/{file}_test.csv', index=False)

# "Everything that is not the negative class" / "not the positive class":
# same counts the original obtained by deleting the opposite class.
values['Train'] = [
    int(np.count_nonzero(trnY != negative)),
    int(np.count_nonzero(trnY != positive)),
]
values['Test'] = [
    int(np.count_nonzero(tstY != negative)),
    int(np.count_nonzero(tstY != positive)),
]

plt.figure(figsize=(12, 4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Re-read the persisted partitions so dtypes are re-inferred per column
# (the in-memory frames were built from a single mixed-type array).
train: DataFrame = read_csv(f'imagens/{file}_train.csv')
test: DataFrame = read_csv(f'imagens/{file}_test.csv')

## MV Imputation - drop
# Number of missing values per column, only for columns that have any.
na_counts = train.isna().sum()
mv = {var: nr for var, nr in na_counts.items() if nr > 0}

## Drop columns with more than 90% MVs
threshold = train.shape[0] * 0.90
missings = [c for c in mv if mv[c] > threshold]
df = train.drop(columns=missings, inplace=False)
# The test set must lose exactly the same columns as the training set,
# otherwise the classifier is asked to predict on a different feature space.
test_drop: DataFrame = test.drop(columns=missings, inplace=False)
print(df.columns)

df.to_csv(f'imagens/{file}_drop_columns_mv_train.csv', index=True)
print('Dropped variables', missings)
print(df.shape)

train_drop: DataFrame = read_csv(f'imagens/{file}_drop_columns_mv_train.csv', index_col=0)

## Fill the remaining missing values with the (train) mean
variables = get_variable_types(train_drop)
# Only numeric features are kept for the model; the target is handled
# separately so it is never imputed nor silently discarded.
numeric_vars = [c for c in variables['Numeric'] if c != target]
if not numeric_vars:
    raise ValueError('No numeric feature columns available for imputation')

# fill_value is only used by strategy='constant', so it is not passed here.
imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
tmp_nr = DataFrame(
    imp.fit_transform(train_drop[numeric_vars]),
    columns=numeric_vars,
    index=train_drop.index,
)
df = concat([tmp_nr, train_drop[target]], axis=1)
df.to_csv(f'imagens/{file}_train_drop_mean.csv', index=True)
print(df.describe(include='all'))

# Apply the imputer fitted on train to the test features (transform only).
test_drop_mean: DataFrame = concat(
    [
        DataFrame(
            imp.transform(test_drop[numeric_vars]),
            columns=numeric_vars,
            index=test_drop.index,
        ),
        test_drop[target],
    ],
    axis=1,
)

train_drop_mean: DataFrame = read_csv(f'imagens/{file}_train_drop_mean.csv', index_col=0)

## NB
trnY: ndarray = train_drop_mean.pop(target).values
trnX: ndarray = train_drop_mean.values
labels = unique(trnY)
labels.sort()

tstY: ndarray = test_drop_mean.pop(target).values
# Index by the train columns so both matrices share the same feature order.
tstX: ndarray = test_drop_mean[train_drop_mean.columns].values

clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn = clf.predict(trnX)
prd_tst = clf.predict(tstX)
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'imagens/{file}_nb_best_drop_mean.png')
show()
Editor is loading...