Untitled
unknown
plain_text
3 years ago
3.4 kB
8
Indexable
# Missing-value threshold sweep.
# For each candidate ratio in `thresholds`: drop the variables that have too
# many missing values, mean-impute the remaining numeric variables, split the
# result into train/test sets, and evaluate a Gaussian Naive Bayes classifier.
#
# The following names are used but not defined in this section and must come
# from earlier in the file: data, mv, file, values, positive, negative, np,
# plt, ds, train_test_split.
from matplotlib.pyplot import savefig, show
from numpy import nan, ndarray
from pandas import DataFrame, concat, read_csv, unique
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB

from ds_charts import get_variable_types, plot_evaluation_results

target = 'target'
thresholds = [0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
# NOTE(review): nothing in this section appends to these two lists; they stay
# empty after the loop. Confirm what should be collected per threshold.
train_precisions = []
test_precisions = []

for threshold in thresholds:
    # `threshold` is a ratio, so it is scaled by the number of records before
    # being compared with the per-variable values in `mv`.
    # Assumes mv maps variable name -> number of missing records (as the
    # former `data.shape[0] * 0.90` threshold implied) -- TODO confirm.
    max_missing = data.shape[0] * threshold
    missings = [c for c, nr_missing in mv.items() if nr_missing > max_missing]
    df = data.drop(columns=missings, inplace=False)
    df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)
    print('Dropped variables', missings)
    print(df.shape)

    # NOTE(review): the file above is written with index=False, so
    # index_col=0 turns its first data column into the index and removes it
    # from the variables. Kept as-is because it is not visible here whether
    # that column is an identifier -- confirm.
    data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv', index_col=0)

    # Fill the missing values of the numeric variables with the column mean.
    tmp_nr, tmp_sb, tmp_bool = None, None, None
    variables = get_variable_types(data_drop)
    numeric_vars = variables['Numeric']
    print(numeric_vars)
    if len(numeric_vars) > 0:
        imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
        tmp_nr = DataFrame(imp.fit_transform(data_drop[numeric_vars]), columns=numeric_vars)
    # tmp_sb and tmp_bool are never filled here, so only the numeric variables
    # (plus the target re-attached below) survive. concat skips None entries
    # and raises ValueError when every entry is None (no numeric variables).
    df = concat([tmp_nr, tmp_sb, tmp_bool], axis=1)
    df.index = data_drop.index
    df[["target"]] = data_drop[["target"]]
    df.to_csv(f'dados/{file}_data_drop_mean.csv', index=False)
    print(df.describe(include='all'))

    # The file was written without an index column, so it is read back
    # without index_col; index_col=0 would consume the first numeric variable
    # as the index and silently drop it from the feature matrix.
    data_drop_mean: DataFrame = read_csv(f'dados/{file}_data_drop_mean.csv')

    # Train and test split (70/30, stratified on the target).
    y: np.ndarray = data_drop_mean.pop(target).values
    X: np.ndarray = data_drop_mean.values
    labels: np.ndarray = unique(y)
    labels.sort()

    trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=0)

    train_drop_mean = concat(
        [DataFrame(trnX, columns=data_drop_mean.columns), DataFrame(trnY, columns=[target])],
        axis=1,
    )
    train_drop_mean.to_csv(f'dados/{file}_train_drop_mean.csv', index=False)

    test_drop_mean = concat(
        [DataFrame(tstX, columns=data_drop_mean.columns), DataFrame(tstY, columns=[target])],
        axis=1,
    )
    test_drop_mean.to_csv(f'dados/{file}_test_drop_mean.csv', index=False)

    # Per split: [records that are not `negative`, records that are not
    # `positive`], in the same order as the [positive, negative] chart labels.
    for split_name, split_y in (('Train_drop_mean', trnY), ('Test_drop_mean', tstY)):
        values[split_name] = [
            int(np.count_nonzero(split_y != negative)),
            int(np.count_nonzero(split_y != positive)),
        ]

    plt.figure(figsize=(12, 4))
    ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
    plt.show()

    # Reload the persisted splits so the classifier sees exactly what is on disk.
    train_drop_mean: DataFrame = read_csv(f'dados/{file}_train_drop_mean.csv')
    test_drop_mean: DataFrame = read_csv(f'dados/{file}_test_drop_mean.csv')

    # Naive Bayes
    trnY: ndarray = train_drop_mean.pop(target).values
    trnX: ndarray = train_drop_mean.values
    labels = unique(trnY)
    labels.sort()
    print(test_drop_mean.columns)
    tstY: ndarray = test_drop_mean.pop(target).values
    tstX: ndarray = test_drop_mean.values

    clf = GaussianNB()
    clf.fit(trnX, trnY)
    prd_trn = clf.predict(trnX)
    prd_tst = clf.predict(tstX)

    # NOTE(review): `eval` shadows the builtin of the same name; the name is
    # kept in case later code reads it.
    eval = plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
    print(eval)
Editor is loading...