k_mean_clustering

mail@pastecode.io avatar
unknown
python
7 months ago
1.5 kB
7
Indexable
Never
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


# ---------------------------------------------------------------------------
# Load the store data and cluster the stores on a single feature,
# 'Average Sales', with K-Means (4 clusters).
# ---------------------------------------------------------------------------

# Workbook to read; the code below requires an 'Average Sales' column in it.
excel_file = r"C:\Users\andyl\Documents\UTCC\All_stores_kmean.xlsx"
df = pd.read_excel(excel_file)

# Blank cells are read as NaN, and KMeans rejects NaN input with a ValueError.
# Remove such rows (and say so) instead of failing deep inside scikit-learn.
# When nothing is missing, df is left exactly as loaded.
missing_sales = df['Average Sales'].isna()
if missing_sales.any():
    print(f"Ignoring {int(missing_sales.sum())} row(s) with no 'Average Sales' value")
    # .copy() gives an independent frame, so adding the 'Cluster' column
    # below cannot raise SettingWithCopyWarning on older pandas versions.
    df = df[~missing_sales].copy()

# Double brackets keep the feature as a one-column DataFrame (2-D), which is
# the shape the scikit-learn estimators below are fitted on.
data = df[['Average Sales']]

# Standardise the feature (zero mean, unit variance) before clustering.
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data)

# random_state pins the initialisation so reruns yield the same labels;
# n_init=10 runs ten initialisations and keeps the best one.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(normalized_data)

# The fitted centres are in standardised units; convert them back to the
# original sales scale.
centroids = scaler.inverse_transform(kmeans.cluster_centers_)

# Rows above 3,000,000 in average sales are flagged as outliers. They are
# only flagged here: these rows still took part in the clustering above.
outliers = df[df['Average Sales'] > 3000000]


# ---------------------------------------------------------------------------
# One-dimensional cluster plot: every point is placed on the y = 0 line at
# its 'Average Sales' value, and colour encodes the K-Means label.
# ---------------------------------------------------------------------------


def _on_baseline(points):
    """Return one y = 0 coordinate for each row of *points*."""
    return [0] * len(points)


# Marker styling shared by the two store-level scatter layers.
_store_marker = dict(marker='o', edgecolors='k', s=50, alpha=0.7)

# Tick positions on the sales axis; the labels are the same numbers written
# with thousands separators.
_sales_ticks = [10000, 500000, 1000000, 1500000, 2000000, 2500000, 3000000]
_sales_tick_labels = [f'{tick:,}' for tick in _sales_ticks]

_, ax = plt.subplots(figsize=(10, 6))

# Dashed reference line spanning the full width of the axes.
ax.axhline(y=0, xmin=0, xmax=1, color='black', linestyle='--', label='Average Sales Range')

# Layers are added in this order: all stores, then centroids, then outliers.
ax.scatter(
    df['Average Sales'],
    _on_baseline(df),
    c=df['Cluster'],
    cmap='viridis',
    label='Data Points',
    **_store_marker,
)
ax.scatter(
    centroids[:, 0],
    _on_baseline(centroids),
    marker='x',
    s=200,
    linewidths=3,
    color='red',
    label='Centroids',
)
# NOTE(review): `outliers` holds rows with Average Sales > 3,000,000, while
# the x-axis below stops at 3,000,000, so this layer lies outside the visible
# range. Confirm whether the right-hand limit should be widened.
ax.scatter(
    outliers['Average Sales'],
    _on_baseline(outliers),
    c='orange',
    label='Outliers',
    **_store_marker,
)

# Titles and axis set-up; the y-axis carries no information, so its ticks
# are removed.
ax.set_title('K-Means Clustering with Centroids and Outliers (1 Variable)')
ax.set_xlabel('Average Sales')
ax.set_ylabel('Cluster')
ax.set_xlim(10000, 3000000)
ax.set_ylim(-1, 1)
ax.set_yticks([])

ax.set_xticks(_sales_ticks)
ax.set_xticklabels(_sales_tick_labels)

ax.legend()

plt.show()
Leave a Comment