import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Load the dataset (the Pima Indians Diabetes CSV, read from a local file)
csv_path = 'diabetes.csv'
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data = pd.read_csv(csv_path, header=None, names=column_names)
# Convert columns to numeric; anything that fails to parse (e.g. a stray header
# row read as data) becomes NaN
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')
# In these columns a value of 0 stands in for a missing measurement, so treat zeros as NaN
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols] = data[cols].replace(0, np.nan)
# Drop rows with missing values
data.dropna(inplace=True)
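# Quick sanity check (optional): dropping every row with a NaN is aggressive
# here, since many Insulin entries are zero, so report what survives
print(f"Rows remaining after cleaning: {len(data)}")
print(data['Outcome'].value_counts())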
# Split the dataset into features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
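# Note: passing stratify=y to train_test_split would preserve the Outcome class ratio in both splits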
# Standardize the features (zero mean, unit variance)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
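# Scaling matters for KNN: both distance metrics sum raw feature differences,
# so large-valued columns such as Insulin would otherwise dominate the distance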
# Implementing KNN from scratch
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))

def manhattan_distance(row1, row2):
    return np.sum(np.abs(row1 - row2))
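
# Worked example with made-up vectors: for a = [0, 3] and b = [4, 0],
# Euclidean distance is sqrt(4**2 + 3**2) = 5.0 and Manhattan is 4 + 3 = 7.0
assert np.isclose(euclidean_distance(np.array([0, 3]), np.array([4, 0])), 5.0)
assert np.isclose(manhattan_distance(np.array([0, 3]), np.array([4, 0])), 7.0)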
def get_neighbors(train, train_labels, test_row, num_neighbors, distance_metric):
    # Compute the distance from the test row to every training row, then
    # return the labels of the num_neighbors closest rows
    distances = []
    for i, train_row in enumerate(train):
        dist = distance_metric(test_row, train_row)
        distances.append((train_labels.iloc[i], dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors
def predict_classification(train, train_labels, test_row, num_neighbors, distance_metric):
    # Majority vote among the nearest neighbors' labels
    neighbors = get_neighbors(train, train_labels, test_row, num_neighbors, distance_metric)
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction
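
# Tiny sanity check of the voting logic (made-up points, not from the dataset):
# two training points labeled 0 sit near the query and one labeled 1 is far
# away, so a k=3 vote should return class 0
_toy_train = np.array([[0.0], [0.1], [5.0]])
_toy_labels = pd.Series([0, 0, 1])
assert predict_classification(_toy_train, _toy_labels, np.array([0.05]), 3, euclidean_distance) == 0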
def knn_predict(train, train_labels, test, num_neighbors, distance_metric):
    # Classify every row of the test set
    predictions = []
    for row in test:
        output = predict_classification(train, train_labels, row, num_neighbors, distance_metric)
        predictions.append(output)
    return predictions
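
# Cross-check against scikit-learn (a minimal sketch: KNeighborsClassifier is
# the library's built-in KNN; results should closely match the scratch version,
# up to tie-breaking among equidistant neighbors)
from sklearn.neighbors import KNeighborsClassifier
sk_knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
sk_knn.fit(X_train_scaled, y_train)
print(f"scikit-learn KNN (k=5, Euclidean) accuracy: {accuracy_score(y_test, sk_knn.predict(X_test_scaled))}")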
# Experiment with different values of k and store accuracy values for both distance metrics
k_values = range(1, 11)
euclidean_accuracies = []
manhattan_accuracies = []
for k in k_values:
    # Euclidean distance
    y_pred_euclidean = knn_predict(X_train_scaled, y_train, X_test_scaled, num_neighbors=k, distance_metric=euclidean_distance)
    accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
    euclidean_accuracies.append(accuracy_euclidean)
    # Manhattan distance
    y_pred_manhattan = knn_predict(X_train_scaled, y_train, X_test_scaled, num_neighbors=k, distance_metric=manhattan_distance)
    accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)
    manhattan_accuracies.append(accuracy_manhattan)
    print(f"Accuracy with k={k} (Euclidean): {accuracy_euclidean}")
    print(f"Accuracy with k={k} (Manhattan): {accuracy_manhattan}")
# Plot the accuracy vs. number of neighbors for both distance metrics
plt.figure(figsize=(10, 6))
plt.plot(k_values, euclidean_accuracies, marker='o', label='Euclidean Distance')
plt.plot(k_values, manhattan_accuracies, marker='s', label='Manhattan Distance')
plt.title('Accuracy vs. Number of Neighbors (k)')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.legend()
plt.grid(True)
plt.show()
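# Summarize the sweep: report the best k per metric (np.argmax returns the
# first index in case of ties)
best_k_euclidean = k_values[int(np.argmax(euclidean_accuracies))]
best_k_manhattan = k_values[int(np.argmax(manhattan_accuracies))]
print(f"Best k (Euclidean): {best_k_euclidean} with accuracy {max(euclidean_accuracies)}")
print(f"Best k (Manhattan): {best_k_manhattan} with accuracy {max(manhattan_accuracies)}")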