# K-Means Clustering
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('datasets/K-Means/Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values  # columns 3 and 4: Annual Income (k$) and Spending Score (1-100)
# No dependent variable is needed: K-Means clustering is unsupervised.
# y = dataset.iloc[:, 3].values
print(dataset)
     CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
  0           1    Male   19                  15                      39
  1           2    Male   21                  15                      81
  2           3  Female   20                  16                       6
  3           4  Female   23                  16                      77
  4           5  Female   31                  17                      40
  5           6  Female   22                  17                      76
  6           7  Female   35                  18                       6
  7           8  Female   23                  18                      94
  8           9    Male   64                  19                       3
  9          10  Female   30                  19                      72
 10          11    Male   67                  19                      14
 11          12  Female   35                  19                      99
 12          13  Female   58                  20                      15
 13          14  Female   24                  20                      77
 14          15    Male   37                  20                      13
 15          16    Male   22                  20                      79
 16          17  Female   35                  21                      35
 17          18    Male   20                  21                      66
 18          19    Male   52                  23                      29
 19          20  Female   35                  23                      98
 20          21    Male   35                  24                      35
 21          22    Male   25                  24                      73
 22          23  Female   46                  25                       5
 23          24    Male   31                  25                      73
 24          25  Female   54                  28                      14
 25          26    Male   29                  28                      82
 26          27  Female   45                  28                      32
 27          28    Male   35                  28                      61
 28          29  Female   40                  29                      31
 29          30  Female   23                  29                      87
 ..         ...     ...  ...                 ...                     ...
170         171    Male   40                  87                      13
171         172    Male   28                  87                      75
172         173    Male   36                  87                      10
173         174    Male   36                  87                      92
174         175  Female   52                  88                      13
175         176  Female   30                  88                      86
176         177    Male   58                  88                      15
177         178    Male   27                  88                      69
178         179    Male   59                  93                      14
179         180    Male   35                  93                      90
180         181  Female   37                  97                      32
181         182  Female   32                  97                      86
182         183    Male   46                  98                      15
183         184  Female   29                  98                      88
184         185  Female   41                  99                      39
185         186    Male   30                  99                      97
186         187  Female   54                 101                      24
187         188    Male   28                 101                      68
188         189  Female   41                 103                      17
189         190  Female   36                 103                      85
190         191  Female   34                 103                      23
191         192  Female   32                 103                      69
192         193    Male   33                 113                       8
193         194  Female   38                 113                      91
194         195  Female   47                 120                      16
195         196  Female   35                 120                      79
196         197  Female   45                 126                      28
197         198    Male   32                 126                      74
198         199    Male   32                 137                      18
199         200    Male   30                 137                      83

[200 rows x 5 columns]
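# Aside (an addition, not part of the original analysis): K-Means relies on
# Euclidean distance, so features on very different scales can dominate the
# clustering. The two columns used here (income in k$ and a 1-100 score) have
# comparable ranges, so the analysis below proceeds unscaled; the sketch shows
# the usual standardisation step without overwriting X. The name X_scaled is
# illustrative and is not used further.
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)  # zero mean, unit variance per column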
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ is the within-cluster sum of squares (WCSS)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
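# The elbow can be ambiguous to read by eye. As a complementary check (an
# addition to the original script), the silhouette score gives a second
# opinion: it is highest when clusters are compact and well separated.
from sklearn.metrics import silhouette_score
for i in range(2, 11):  # the silhouette is undefined for a single cluster
    labels = KMeans(n_clusters = i, init = 'k-means++', random_state = 42).fit_predict(X)
    print(i, silhouette_score(X, labels))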
# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)  # 5 clusters chosen from the elbow plot above
y_kmeans = kmeans.fit_predict(X)
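# A quick sanity check (illustrative addition): how many customers fell into
# each cluster, and where the centroids sit in (income, score) space.
cluster_ids, counts = np.unique(y_kmeans, return_counts = True)
print(dict(zip(cluster_ids, counts)))  # cluster sizes
print(kmeans.cluster_centers_)         # one (income, score) centroid per cluster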
# Visualising the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Careful')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Standard')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Target')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Impulse')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Sensible')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
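# The five scatter calls above hard-code one colour and one label per cluster.
# An equivalent loop-based sketch (the colour list and generic labels are
# illustrative choices) generalises to any number of clusters:
colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster, color in enumerate(colors):
    plt.scatter(X[y_kmeans == cluster, 0], X[y_kmeans == cluster, 1],
                s = 100, c = color, label = 'Cluster ' + str(cluster + 1))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()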
For clustering in more than two dimensions, the visualisation code above is no longer valid, since a scatter plot can only show two axes. However, a dimensionality-reduction technique such as a self-organising map (SOM) can project the data down to two dimensions for plotting, as sketched below.
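As a sketch of that idea, the snippet below uses PCA rather than a SOM (a simpler stand-in for the same goal of projecting onto two dimensions). It assumes a three-column feature matrix built from Age, Annual Income, and Spending Score; the names X_hd, labels_hd, and X_2d are illustrative.

from sklearn.decomposition import PCA
X_hd = dataset.iloc[:, 2:5].values  # Age, Annual Income (k$), Spending Score (1-100)
labels_hd = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42).fit_predict(X_hd)
X_2d = PCA(n_components = 2).fit_transform(X_hd)  # project the 3-D features onto 2 components
plt.scatter(X_2d[:, 0], X_2d[:, 1], c = labels_hd, s = 100)
plt.title('Clusters projected onto two principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()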