import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Load the dataset
url = 'diabetes.csv'
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']
data = pd.read_csv(url, header=None, names=column_names)

# Convert columns to numeric; any value that fails to convert
# (e.g. a stray header row read in as data) becomes NaN
numeric_cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Replace zero values with NaN in columns where zero is physiologically impossible
cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
data[cols] = data[cols].replace(0, np.nan)

# Drop rows with missing values
data.dropna(inplace=True)

# Split the dataset into features and target variable
X = data.drop('Outcome', axis=1)
y = data['Outcome']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features (distance-based methods like KNN are scale-sensitive)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Implementing KNN from scratch
def euclidean_distance(row1, row2):
    return np.sqrt(np.sum((row1 - row2) ** 2))


def manhattan_distance(row1, row2):
    return np.sum(np.abs(row1 - row2))


def get_neighbors(train, train_labels, test_row, num_neighbors, distance_metric):
    # Compute the distance from the test row to every training row,
    # then keep the labels of the num_neighbors closest rows
    distances = []
    for i, train_row in enumerate(train):
        dist = distance_metric(test_row, train_row)
        distances.append((train_labels.iloc[i], dist))
    distances.sort(key=lambda x: x[1])
    neighbors = [distances[i][0] for i in range(num_neighbors)]
    return neighbors


def predict_classification(train, train_labels, test_row, num_neighbors, distance_metric):
    # Majority vote among the nearest neighbors
    neighbors = get_neighbors(train, train_labels, test_row, num_neighbors, distance_metric)
    prediction = max(set(neighbors), key=neighbors.count)
    return prediction


def knn_predict(train, train_labels, test, num_neighbors, distance_metric):
    predictions = []
    for row in test:
        output = predict_classification(train, train_labels, row, num_neighbors, distance_metric)
        predictions.append(output)
    return predictions


# Experiment with different values of k and store accuracy values for both distance metrics
k_values = range(1, 11)
euclidean_accuracies = []
manhattan_accuracies = []

for k in k_values:
    # Euclidean distance
    y_pred_euclidean = knn_predict(X_train_scaled, y_train, X_test_scaled,
                                   num_neighbors=k, distance_metric=euclidean_distance)
    accuracy_euclidean = accuracy_score(y_test, y_pred_euclidean)
    euclidean_accuracies.append(accuracy_euclidean)

    # Manhattan distance
    y_pred_manhattan = knn_predict(X_train_scaled, y_train, X_test_scaled,
                                   num_neighbors=k, distance_metric=manhattan_distance)
    accuracy_manhattan = accuracy_score(y_test, y_pred_manhattan)
    manhattan_accuracies.append(accuracy_manhattan)

    print(f"Accuracy with k={k} (Euclidean): {accuracy_euclidean}")
    print(f"Accuracy with k={k} (Manhattan): {accuracy_manhattan}")
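
# Sanity check (an optional aside): sklearn's built-in KNeighborsClassifier
# should closely match the from-scratch Euclidean results above.
# k=5 here is an arbitrary choice, picked only for the comparison.
from sklearn.neighbors import KNeighborsClassifier

sk_knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
sk_knn.fit(X_train_scaled, y_train)
sk_accuracy = accuracy_score(y_test, sk_knn.predict(X_test_scaled))
print(f"Accuracy with k=5 (sklearn, Euclidean): {sk_accuracy}")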
# Plot the accuracy vs. number of neighbors for both distance metrics
plt.figure(figsize=(10, 6))
plt.plot(k_values, euclidean_accuracies, marker='o', label='Euclidean Distance')
plt.plot(k_values, manhattan_accuracies, marker='s', label='Manhattan Distance')
plt.title('Accuracy vs. Number of Neighbors (k)')
plt.xlabel('Number of Neighbors (k)')
plt.ylabel('Accuracy')
plt.xticks(k_values)
plt.legend()
plt.grid(True)
plt.show()
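
# A vectorized alternative to knn_predict (a sketch, not used above): computing
# all pairwise Euclidean distances at once with NumPy broadcasting removes the
# per-row Python loop and is much faster on larger datasets. The helper name
# knn_predict_vectorized is introduced here for illustration.
def knn_predict_vectorized(train, train_labels, test, num_neighbors):
    # (n_test, 1, n_features) - (n_train, n_features) broadcasts to
    # (n_test, n_train, n_features); summing squared differences over the last
    # axis gives squared distances, which preserve the neighbor ordering.
    sq_dists = ((test[:, np.newaxis, :] - train) ** 2).sum(axis=2)
    labels = train_labels.to_numpy()
    predictions = []
    for nearest in np.argsort(sq_dists, axis=1)[:, :num_neighbors]:
        votes = list(labels[nearest])
        predictions.append(max(set(votes), key=votes.count))
    return predictions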