# Untitled
#
# mail@pastecode.io avatar
# unknown
# plain_text
# 2 years ago
# 4.9 kB
# 3
# Indexable
# Never
import copy
import numpy as np
from pandas import read_csv, concat, unique, DataFrame
import matplotlib.pyplot as plt
import ds_charts as ds
from sklearn.model_selection import train_test_split
import pandas

# Base name used when building the output paths under ``dados/``.
file = 'monthly_data'
filename = 'small_ca_monthly_data.csv'

# Read the raw CSV (the literal string 'na' is parsed as a missing value),
# then discard columns that are entirely empty and the ``station`` column.
data = (
    read_csv(filename, na_values='na')
    .dropna(axis=1, how='all')
    .drop(columns='station')
)

# FRGT entries that cannot be parsed as numbers become NaN instead of raising.
data['FRGT'] = pandas.to_numeric(data['FRGT'], errors='coerce')

# Cast these columns to an integer dtype.
int_columns = ['year', 'month', 'DAPR', 'DWPR', 'MDPR', 'PRCP']
data[int_columns] = data[int_columns].astype(int)

print(data.shape)
print(data.FRGT)

# Name of the class column and the two label values it is compared against.
target = 'target'
positive = 1
negative = 0

# Row counts per label in the full dataset, ordered [positive, negative].
values = {
    'Original': [len(data[data[target] == label]) for label in (positive, negative)],
}

## MV Imputation - drop cols + mean
from pandas.plotting import register_matplotlib_converters

register_matplotlib_converters()

# Number of missing values per column; ``mv`` keeps only the columns that
# have at least one missing value, in the DataFrame's column order.
na_counts = {column: data[column].isna().sum() for column in data}
mv = {column: count for column, count in na_counts.items() if count > 0}

## For each threshold: drop the columns whose share of missing values exceeds
## it, impute the remaining numeric columns with the column mean, split into
## train/test, and evaluate a Gaussian Naive Bayes classifier.
from matplotlib.pyplot import savefig, show
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
from ds_charts import get_variable_types, plot_evaluation_results

# One entry is appended per threshold, in the same order as ``thresholds``.
accuracy_values = []
recall_values = []
f1score_values = []
precision_values = []
# Maximum tolerated fraction of missing rows per column.
thresholds = [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
# Not populated by this section; kept so the module-level names still exist.
train_precisions = []
test_precisions = []

n_rows = data.shape[0]

for threshold in thresholds:
    # ``mv`` stores absolute missing-value counts, so the fractional threshold
    # must be scaled to a row count before comparing. Comparing the raw
    # fraction would drop every column that has any missing value.
    max_missing = threshold * n_rows
    missings = [column for column, count in mv.items() if count > max_missing]

    # NOTE: the CSV paths do not include the threshold, so every iteration
    # overwrites the files written by the previous one.
    df = data.drop(columns=missings)
    df.to_csv(f'dados/{file}_drop_columns_mv.csv', index=False)

    print('Dropped variables', missings)
    print(df.shape)
    # The file was written without an index column, so it is read back without
    # ``index_col``; otherwise the first data column would be consumed as the
    # index and silently lost.
    data_drop: DataFrame = read_csv(f'dados/{file}_drop_columns_mv.csv')

    # Fill the remaining missing values with the column mean. Only the columns
    # reported under 'Numeric' are kept; other variable types are discarded.
    variables = get_variable_types(data_drop)
    numeric_vars = variables['Numeric']
    print(numeric_vars)

    if len(numeric_vars) == 0:
        raise ValueError(
            f'No numeric variables left to impute for threshold {threshold}'
        )

    imp = SimpleImputer(strategy='mean', missing_values=np.nan, copy=True)
    df = DataFrame(
        imp.fit_transform(data_drop[numeric_vars]),
        columns=numeric_vars,
        index=data_drop.index,
    )
    # Re-attach the class column, which may not be among the numeric variables.
    df[target] = data_drop[target]
    df.to_csv(f'dados/{file}_data_drop_mean.csv', index=False)

    print(df.describe(include='all'))
    # Same round-trip rule as above: no ``index_col`` because no index was written.
    data_drop_mean: DataFrame = read_csv(f'dados/{file}_data_drop_mean.csv')

    ## Train and test split
    y: np.ndarray = data_drop_mean.pop(target).values
    X: np.ndarray = data_drop_mean.values
    feature_names = data_drop_mean.columns

    trnX, tstX, trnY, tstY = train_test_split(
        X, y, train_size=0.7, stratify=y, random_state=0
    )

    train_drop_mean = concat(
        [DataFrame(trnX, columns=feature_names), DataFrame(trnY, columns=[target])],
        axis=1,
    )
    train_drop_mean.to_csv(f'dados/{file}_train_drop_mean.csv', index=False)

    test_drop_mean = concat(
        [DataFrame(tstX, columns=feature_names), DataFrame(tstY, columns=[target])],
        axis=1,
    )
    test_drop_mean.to_csv(f'dados/{file}_test_drop_mean.csv', index=False)

    # Counts are ordered [positive, negative] to match the chart categories:
    # everything that is not ``negative`` first, then everything that is not
    # ``positive``.
    values['Train_drop_mean'] = [
        int((trnY != negative).sum()),
        int((trnY != positive).sum()),
    ]
    values['Test_drop_mean'] = [
        int((tstY != negative).sum()),
        int((tstY != positive).sum()),
    ]

    plt.figure(figsize=(12, 4))
    ds.multiple_bar_chart([positive, negative], values, title='Data distribution per dataset')
    plt.show()

    # Reload the splits from disk so the classifier sees exactly what was saved.
    train_drop_mean = read_csv(f'dados/{file}_train_drop_mean.csv')
    test_drop_mean = read_csv(f'dados/{file}_test_drop_mean.csv')

    ## NB
    trnY = train_drop_mean.pop(target).values
    trnX = train_drop_mean.values

    labels = unique(trnY)
    labels.sort()

    print(test_drop_mean.columns)
    tstY = test_drop_mean.pop(target).values
    tstX = test_drop_mean.values

    clf = GaussianNB()
    clf.fit(trnX, trnY)
    prd_trn = clf.predict(trnX)
    prd_tst = clf.predict(tstX)

    # NOTE(review): assumes this project's plot_evaluation_results returns a
    # dict with the keys used below - confirm against ds_charts.
    evaluation_dict = plot_evaluation_results(labels, trnY, prd_trn, tstY, prd_tst)
    accuracy_values.append(evaluation_dict['Accuracy'])
    recall_values.append(evaluation_dict['Recall'])
    f1score_values.append(evaluation_dict['F1-score'])
    precision_values.append(evaluation_dict['Precision'])

print(accuracy_values)
print(recall_values)
print(f1score_values)
print(precision_values)
# Saves whichever figure is current at this point - presumably the evaluation
# plot of the last threshold; TODO confirm this is the intended figure.
savefig(f'dados/{file}_nb_best_drop_mean.png')
show()