# Untitled
# Paste-site page metadata (not part of the script): avatar, unknown author,
# plain_text, 2 years ago, 8.4 kB, 6, Indexable
import numpy as np
from pandas import read_csv, concat, unique, DataFrame
import matplotlib.pyplot as plt
import ds_charts as ds
from sklearn.model_selection import train_test_split
import pandas

# Tag that prefixes the name of every artefact this script writes.
file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'

# Load the raw CSV (the literal 'na' marks a missing value), discard columns
# that contain no data at all, and remove the 'station' column.
data = (
    read_csv(filename, na_values='na')
    .dropna(axis=1, how='all')
    .drop(columns='station')
)
# FRGT entries that cannot be parsed as numbers are coerced to NaN.
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')
print(data.shape)
print(data['FRGT'])

# Name of the class column and its two labels.
target = 'target'
positive = 1
negative = 0

# Class counts per dataset, ordered [positive, negative].
class_column = data[target]
values = {
    'Original': [
        len(data[class_column == positive]),
        len(data[class_column == negative]),
    ]
}

##MV Imputation - drop cols + mean
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

# Number of missing values per variable, keeping only variables that have any.
na_counts = data.isna().sum()
mv = {var: nr for var, nr in na_counts.items() if nr > 0}

##Drop columns with more than 90% MVs
threshold = data.shape[0] * 0.90

missings = [var for var, nr in mv.items() if nr > threshold]
df = data.drop(columns=missings, inplace=False)
df.to_csv(f'imagens/{file}_drop_columns_mv.csv', index=False)

print('Dropped variables', missings)
print(df.shape)
# The file above is written without an index column, so it is read back
# without index_col as well. Reading it with index_col=0 would turn the first
# data column into the index and silently remove it from data_drop.values.
data_drop: DataFrame = read_csv(f'imagens/{file}_drop_columns_mv.csv')

##Train and test split
target = 'target'
# pop() removes the class column in place, so data_drop holds only features
# when its values are taken on the next line.
y: np.ndarray = data_drop.pop(target).values
X: np.ndarray = data_drop.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 split. The fixed seed makes the split, and every file and
# figure derived from it, reproducible from one run to the next.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=42)

# Re-attach the class column to each split and persist both.
train_drop = concat([DataFrame(trnX, columns=data_drop.columns), DataFrame(trnY, columns=[target])], axis=1)
train_drop.to_csv(f'imagens/{file}_train_drop.csv', index=False)
test_drop = concat([DataFrame(tstX, columns=data_drop.columns), DataFrame(tstY, columns=[target])], axis=1)
test_drop.to_csv(f'imagens/{file}_test_drop.csv', index=False)

# Class counts ordered [positive, negative], computed with the same equality
# test as the 'Original' entry of `values`.
values['Train_drop'] = [int((trnY == positive).sum()), int((trnY == negative).sum())]
values['Test_drop'] = [int((tstY == positive).sum()), int((tstY == negative).sum())]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Load the splits back from disk; read_csv infers the column dtypes again.
train_drop: DataFrame = read_csv(f'imagens/{file}_train_drop.csv')
test_drop: DataFrame = read_csv(f'imagens/{file}_test_drop.csv')

#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import isnan, nan

# The columns to impute are the ones get_variable_types lists under 'Numeric'
# for the training split. Only these columns plus the target are carried into
# the imputed files.
variables = get_variable_types(train_drop)
numeric_vars = variables['Numeric']
if len(numeric_vars) == 0:
    raise ValueError('no numeric variables to impute in the training split')

# Fit the column means on the training split only and reuse them for the test
# split, so nothing measured on the test rows leaks into the preprocessing.
imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
imp.fit(train_drop[numeric_vars])
# SimpleImputer leaves out of its output any feature whose fitted statistic is
# NaN (no observed value in the training split), so label only the kept ones.
kept_vars = [var for var, mean in zip(numeric_vars, imp.statistics_) if not isnan(mean)]

# Training split.
tmp_nr = DataFrame(imp.transform(train_drop[numeric_vars]), columns=kept_vars, index=train_drop.index)
df = tmp_nr
df[target] = train_drop[target]
df.to_csv(f'imagens/{file}_train_drop_mean.csv', index=False)

print(df.describe(include='all'))
# Written without an index column, so read back without index_col; with
# index_col=0 the first feature would become the index and be lost.
train_drop_mean: DataFrame = read_csv(f'imagens/{file}_train_drop_mean.csv')

# Test split, transformed with the means learned from the training split.
tmp_nr = DataFrame(imp.transform(test_drop[numeric_vars]), columns=kept_vars, index=test_drop.index)
df = tmp_nr
df[target] = test_drop[target]
df.to_csv(f'imagens/{file}_test_drop_mean.csv', index=False)

print(df.describe(include='all'))

test_drop_mean: DataFrame = read_csv(f'imagens/{file}_test_drop_mean.csv')

#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results

# Separate the class vector from the features of the training frame. pop()
# removes the target column in place, so only features remain for .values.
trnY: ndarray = train_drop_mean.pop(target).values
trnX: ndarray = train_drop_mean.values

# Sorted class labels observed in the training split.
labels = np.sort(unique(trnY))

# Same separation for the test frame (its columns are printed first).
print(test_drop_mean.columns)
tstY: ndarray = test_drop_mean.pop(target).values
tstX: ndarray = test_drop_mean.values

# Train Gaussian Naive Bayes on the training split and predict both splits.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)

# Plot the evaluation, save the figure, then display it.
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'imagens/{file}_nb_best_drop_mean.png')
show()


##MV Imputation - drop rows + mean


##Drop rows with more than 60% MVs
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

# A row is kept only if at least 40% of its columns hold a value.
n_columns = data.shape[1]
threshold = n_columns * 0.40

df = data.dropna(thresh=threshold)
# The index is written as the first CSV column (to_csv default) and restored
# as the index on the way back in, so no data column is consumed by index_col.
df.to_csv(f'imagens/{file}_drop_rows_mv.csv')
print(df.shape)
print(df.columns)
data_drop_r: DataFrame = read_csv(f'imagens/{file}_drop_rows_mv.csv', index_col=0)

##Train and test split
target = 'target'
# pop() removes the class column in place, so data_drop_r holds only features
# when its values are taken on the next line.
y: np.ndarray = data_drop_r.pop(target).values
X: np.ndarray = data_drop_r.values
labels: np.ndarray = unique(y)
labels.sort()

# Stratified 70/30 split. The fixed seed makes the split, and every file and
# figure derived from it, reproducible from one run to the next.
trnX, tstX, trnY, tstY = train_test_split(X, y, train_size=0.7, stratify=y, random_state=42)

# Re-attach the class column to each split and persist both.
train_drop_r = concat([DataFrame(trnX, columns=data_drop_r.columns), DataFrame(trnY, columns=[target])], axis=1)
train_drop_r.to_csv(f'imagens/{file}_train_drop_r.csv', index=False)
test_drop_r = concat([DataFrame(tstX, columns=data_drop_r.columns), DataFrame(tstY, columns=[target])], axis=1)
test_drop_r.to_csv(f'imagens/{file}_test_drop_r.csv', index=False)

# Class counts ordered [positive, negative], computed with the same equality
# test as the 'Original' entry of `values`.
values['Train_drop_r'] = [int((trnY == positive).sum()), int((trnY == negative).sum())]
values['Test_drop_r'] = [int((tstY == positive).sum()), int((tstY == negative).sum())]

plt.figure(figsize=(12,4))
ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
plt.show()

# Load the splits back from disk; read_csv infers the column dtypes again.
train_drop_r: DataFrame = read_csv(f'imagens/{file}_train_drop_r.csv')
test_drop_r: DataFrame = read_csv(f'imagens/{file}_test_drop_r.csv')

#fill the others with mean
from sklearn.impute import SimpleImputer
from pandas import concat, DataFrame
from ds_charts import get_variable_types
from numpy import isnan, nan

# The columns to impute are the ones get_variable_types lists under 'Numeric'
# for the training split. Only these columns plus the target are carried into
# the imputed files.
variables = get_variable_types(train_drop_r)
numeric_vars = variables['Numeric']
if len(numeric_vars) == 0:
    raise ValueError('no numeric variables to impute in the training split')

# Fit the column means on the training split only and reuse them for the test
# split, so nothing measured on the test rows leaks into the preprocessing.
imp = SimpleImputer(strategy='mean', missing_values=nan, copy=True)
imp.fit(train_drop_r[numeric_vars])
# SimpleImputer leaves out of its output any feature whose fitted statistic is
# NaN (no observed value in the training split), so label only the kept ones.
kept_vars = [var for var, mean in zip(numeric_vars, imp.statistics_) if not isnan(mean)]

# Training split.
tmp_nr = DataFrame(imp.transform(train_drop_r[numeric_vars]), columns=kept_vars, index=train_drop_r.index)
df = tmp_nr
df[target] = train_drop_r[target]
df.to_csv(f'imagens/{file}_train_drop_mean_r.csv', index=False)

print(df.describe(include='all'))
# Written without an index column, so read back without index_col; with
# index_col=0 the first feature would become the index and be lost.
train_drop_mean_r: DataFrame = read_csv(f'imagens/{file}_train_drop_mean_r.csv')

# Test split, transformed with the means learned from the training split.
tmp_nr = DataFrame(imp.transform(test_drop_r[numeric_vars]), columns=kept_vars, index=test_drop_r.index)
df = tmp_nr
df[target] = test_drop_r[target]
df.to_csv(f'imagens/{file}_test_drop_mean_r.csv', index=False)

print(df.describe(include='all'))

test_drop_mean_r: DataFrame = read_csv(f'imagens/{file}_test_drop_mean_r.csv')

#NB
from numpy import ndarray
from pandas import DataFrame, read_csv, unique
from matplotlib.pyplot import savefig, show
from sklearn.naive_bayes import GaussianNB
from ds_charts import plot_evaluation_results

# Separate the class vector from the features of the training frame. pop()
# removes the target column in place, so only features remain for .values.
trnY: ndarray = train_drop_mean_r.pop(target).values
trnX: ndarray = train_drop_mean_r.values

# Sorted class labels observed in the training split.
labels = np.sort(unique(trnY))

# Same separation for the test frame (its columns are printed first).
print(test_drop_mean_r.columns)
tstY: ndarray = test_drop_mean_r.pop(target).values
tstX: ndarray = test_drop_mean_r.values

# Train Gaussian Naive Bayes on the training split and predict both splits.
clf = GaussianNB()
clf.fit(trnX, trnY)
prd_trn, prd_tst = clf.predict(trnX), clf.predict(tstX)

# Plot the evaluation, save the figure, then display it.
plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
savefig(f'imagens/{file}_nb_best_drop_mean_r.png')
show()









# Editor is loading...  (paste-site page residue, not part of the script)