import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedKFold
# Classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
# Column labels for the per-fold accuracy rows: one entry per classifier,
# in the order the accuracies are recorded.
matrix_labels = [
    'gaussian_nb',
    'logistic_regression',
]

# Base names of the benchmark datasets to evaluate (without file extension).
datasets = [
    "bupa", "coil2000", "cryotherapy", "ecoli4", "german",
    "glass2", "glass4", "glass5", "haberman", "heart",
    "ionosphere", "iris", "liver", "mammographic", "monk-2",
    "phoneme", "pima", "popfailures", "ring", "sonar",
    "soybean", "spambase", "spectfheart", "titanic", "twonorm",
    "vowel0", "waveform", "wine", "wisconsin",
]
def prepare_dataset(set_name):
    """Load ``dataset/<set_name>.csv`` and split it into features and labels.

    The file is parsed as comma-separated numeric data. Every column except
    the last is returned as the feature matrix; the last column is cast to
    ``int`` and returned as the label vector.

    Returns:
        A ``(x, y)`` tuple of NumPy arrays.
    """
    table = np.genfromtxt(f"dataset/{set_name}.csv", delimiter=",")
    features = table[:, :-1]
    labels = table[:, -1].astype(int)
    return features, labels
def split_data(x, y):
    """Split ``x`` and ``y`` into train and test parts (30% held out).

    The split uses a fixed ``random_state`` of 42, so repeated calls on the
    same input give the same partition.

    Returns:
        A ``(x_train, x_test, y_train, y_test)`` tuple.
    """
    parts = train_test_split(x, y, test_size=0.3, random_state=42)
    # Keep returning a tuple, as callers unpack four values.
    return tuple(parts)
def get_gaussian(x_train, y_train, x_test):
    """Fit a Gaussian naive Bayes model and predict labels for ``x_test``.

    A fresh ``GaussianNB`` with default settings is trained on
    ``(x_train, y_train)`` on every call.

    Returns:
        The predicted labels for ``x_test``.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return GaussianNB().fit(x_train, y_train).predict(x_test)
def get_logistic_regression(x_train, y_train, x_test):
    """Fit a logistic regression model and predict labels for ``x_test``.

    A fresh ``LogisticRegression`` with default settings is trained on
    ``(x_train, y_train)`` on every call.

    Returns:
        The predicted labels for ``x_test``.
    """
    # fit() returns the estimator itself, so the calls can be chained.
    return LogisticRegression().fit(x_train, y_train).predict(x_test)
def test(x, y):
    """Cross-validate both classifiers on ``(x, y)`` and collect accuracies.

    Uses repeated k-fold cross-validation (5 splits, 2 repeats, fixed seed
    1234). For each fold, both classifiers are trained on the training part
    and scored on the held-out part.

    Returns:
        A list whose first element is ``matrix_labels`` (the column labels)
        and whose remaining elements are ``[gaussian_accuracy,
        logistic_regression_accuracy]`` pairs, one per fold.
    """
    splitter = RepeatedKFold(n_splits=5, n_repeats=2, random_state=1234)
    # Same order as matrix_labels, so each accuracy lands in its column.
    classifiers = (get_gaussian, get_logistic_regression)

    # The label row goes first; per-fold accuracy rows follow.
    rows = [matrix_labels]
    for train_idx, test_idx in splitter.split(x):
        train_x, train_y = x[train_idx], y[train_idx]
        held_x, held_y = x[test_idx], y[test_idx]
        rows.append([
            accuracy_score(held_y, fit_predict(train_x, train_y, held_x))
            for fit_predict in classifiers
        ])
    return rows
def get_score(numpy):
    """Format the mean and standard deviation of a collection of scores.

    ``numpy`` is an iterable of score rows (or plain numbers). Rows made of
    text, such as a leading row of column labels, are ignored; without this,
    ``np.mean`` would be handed a string-typed array and raise ``TypeError``.
    The statistics are computed over all remaining values together, i.e.
    across every row and every column.

    Returns:
        A string of the form ``"Accuracy score: <mean> (<std>)"`` with both
        numbers rounded to three decimals.
    """
    numeric_rows = []
    for row in numpy:
        values = np.asarray(row)
        # Label rows convert to a string dtype ('U' / 'S'); skip them so
        # only numeric accuracies are aggregated.
        if values.dtype.kind in "US":
            continue
        numeric_rows.append(values)

    mean_score = np.mean(numeric_rows)
    std_score = np.std(numeric_rows)
    return "Accuracy score: %.3f (%.3f)" % (mean_score, std_score)
# Evaluate every listed dataset and print one summary line per dataset.
for dataset in datasets:
    x, y = prepare_dataset(dataset)
    # test() runs its own cross-validation on the full data, so no separate
    # train/test split is made here (its result was never used).
    result = test(x, y)
    # The first row of the result holds the classifier labels (strings);
    # only the remaining rows are numeric accuracies that can be averaged.
    print(get_score(result[1:]))