Untitled
unknown
plain_text
2 years ago
3.4 kB
1
Indexable
Never
# Missing-value threshold sweep.
#
# For every threshold: drop the columns whose entry in `mv` exceeds it,
# mean-impute the remaining numeric variables, split 70/30 (stratified) into
# train and test sets, persist each stage under `dados/`, and evaluate a
# Gaussian Naive Bayes classifier on the result.
#
# Names this block uses but does not define (expected from earlier context):
#   data, mv, file, values, positive, negative, np, plt, ds, train_test_split
from matplotlib.pyplot import savefig, show
from numpy import nan, ndarray
from pandas import DataFrame, concat, read_csv, unique
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB

from ds_charts import get_variable_types, plot_evaluation_results

target = 'target'
thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
# TODO(review): nothing below appends to these two lists, so they are still
# empty once the sweep finishes.
train_precisions = []
test_precisions = []

for threshold in thresholds:
    # --- Drop the columns with too many missing values ----------------------
    # NOTE(review): an earlier version used an absolute row count
    # (data.shape[0] * 0.90) as the cut-off. The thresholds here are
    # fractions, so this presumably expects `mv` to hold ratios -- confirm.
    missings = [c for c in mv.keys() if mv[c] > threshold]
    df = data.drop(columns=missings, inplace=False)
    df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)
    print('Dropped variables', missings)
    print(df.shape)

    # NOTE(review): the file above is written without an index, so
    # index_col=0 turns its first data column into the index. Kept as in the
    # original because that column may be an identifier -- confirm.
    data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv', index_col=0)

    # --- Fill the remaining gaps in numeric variables with the mean ---------
    # NOTE(review): the imputer is fitted on the full data set, i.e. before
    # the train/test split further down.
    tmp_nr, tmp_sb, tmp_bool = None, None, None
    variables = get_variable_types(data_drop)
    numeric_vars = variables['Numeric']
    print(numeric_vars)
    if len(numeric_vars) > 0:
        imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
        tmp_nr = DataFrame(imp.fit_transform(data_drop[numeric_vars]), columns=numeric_vars)

    # tmp_sb and tmp_bool are never filled, so only the numeric columns (plus
    # the target re-attached below) make it into the imputed frame.
    df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
    df.index = data_drop.index
    df[[target]] = data_drop[[target]]
    df.to_csv(f'dados/{file}_data_drop_mean.csv', index=False)
    print(df.describe(include='all'))

    # Read back WITHOUT index_col: the file has no index column, and
    # index_col=0 would silently turn the first numeric feature into the
    # index, removing it from X.
    data_drop_mean: DataFrame = read_csv(f'dados/{file}_data_drop_mean.csv')

    # --- Train and test split -----------------------------------------------
    y: ndarray = data_drop_mean.pop(target).values
    X: ndarray = data_drop_mean.values
    trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

    feature_names = data_drop_mean.columns
    train_drop_mean = concat(
        [DataFrame(trnX, columns=feature_names), DataFrame(trnY, columns=[target])],
        axis=1,
    )
    train_drop_mean.to_csv(f'dados/{file}_train_drop_mean.csv', index=False)
    test_drop_mean = concat(
        [DataFrame(tstX, columns=feature_names), DataFrame(tstY, columns=[target])],
        axis=1,
    )
    test_drop_mean.to_csv(f'dados/{file}_test_drop_mean.csv', index=False)

    # Each pair is [records that are not `negative`, records that are not
    # `positive`], in the same order as the chart's [positive, negative] labels.
    values['Train_drop_mean'] = [
        np.count_nonzero(trnY != negative),
        np.count_nonzero(trnY != positive),
    ]
    values['Test_drop_mean'] = [
        np.count_nonzero(tstY != negative),
        np.count_nonzero(tstY != positive),
    ]

    plt.figure(figsize=(12, 4))
    ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
    plt.show()

    # --- Naive Bayes --------------------------------------------------------
    # The classifier is trained on the sets as reloaded from disk.
    train_drop_mean: DataFrame = read_csv(f'dados/{file}_train_drop_mean.csv')
    test_drop_mean: DataFrame = read_csv(f'dados/{file}_test_drop_mean.csv')

    trnY: ndarray = train_drop_mean.pop(target).values
    trnX: ndarray = train_drop_mean.values
    labels = unique(trnY)
    labels.sort()
    print(test_drop_mean.columns)
    tstY: ndarray = test_drop_mean.pop(target).values
    tstX: ndarray = test_drop_mean.values

    clf = GaussianNB()
    clf.fit(trnX, trnY)
    prd_trn = clf.predict(trnX)
    prd_tst = clf.predict(tstX)
    # Named `evaluation` rather than `eval` so the builtin is not shadowed.
    evaluation = plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
    print(evaluation)