# K-Means Clustering
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Importing the dataset
dataset = pd.read_csv('datasets/K-Means/Mall_Customers.csv')
X = dataset.iloc[:, [3, 4]].values  # columns 3 and 4: Annual Income (k$) and Spending Score (1-100)
# No dependent variable is needed: K-Means clustering is unsupervised.
# y = dataset.iloc[:, 3].values
print(dataset)
     CustomerID   Genre  Age  Annual Income (k$)  Spending Score (1-100)
  0           1    Male   19                  15                      39
  1           2    Male   21                  15                      81
  2           3  Female   20                  16                       6
  3           4  Female   23                  16                      77
  4           5  Female   31                  17                      40
  5           6  Female   22                  17                      76
  6           7  Female   35                  18                       6
  7           8  Female   23                  18                      94
  8           9    Male   64                  19                       3
  9          10  Female   30                  19                      72
 10          11    Male   67                  19                      14
 11          12  Female   35                  19                      99
 12          13  Female   58                  20                      15
 13          14  Female   24                  20                      77
 14          15    Male   37                  20                      13
 15          16    Male   22                  20                      79
 16          17  Female   35                  21                      35
 17          18    Male   20                  21                      66
 18          19    Male   52                  23                      29
 19          20  Female   35                  23                      98
 20          21    Male   35                  24                      35
 21          22    Male   25                  24                      73
 22          23  Female   46                  25                       5
 23          24    Male   31                  25                      73
 24          25  Female   54                  28                      14
 25          26    Male   29                  28                      82
 26          27  Female   45                  28                      32
 27          28    Male   35                  28                      61
 28          29  Female   40                  29                      31
 29          30  Female   23                  29                      87
 ..         ...     ...  ...                 ...                     ...
170         171    Male   40                  87                      13
171         172    Male   28                  87                      75
172         173    Male   36                  87                      10
173         174    Male   36                  87                      92
174         175  Female   52                  88                      13
175         176  Female   30                  88                      86
176         177    Male   58                  88                      15
177         178    Male   27                  88                      69
178         179    Male   59                  93                      14
179         180    Male   35                  93                      90
180         181  Female   37                  97                      32
181         182  Female   32                  97                      86
182         183    Male   46                  98                      15
183         184  Female   29                  98                      88
184         185  Female   41                  99                      39
185         186    Male   30                  99                      97
186         187  Female   54                 101                      24
187         188    Male   28                 101                      68
188         189  Female   41                 103                      17
189         190  Female   36                 103                      85
190         191  Female   34                 103                      23
191         192  Female   32                 103                      69
192         193    Male   33                 113                       8
193         194  Female   38                 113                      91
194         195  Female   47                 120                      16
195         196  Female   35                 120                      79
196         197  Female   45                 126                      28
197         198    Male   32                 126                      74
198         199    Male   32                 137                      18
199         200    Male   30                 137                      83

[200 rows x 5 columns]
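# Aside (an addition, not part of the original analysis): K-Means relies on
# Euclidean distance, so features on very different scales can dominate the
# clustering. The two columns used here (income in k$ and a 1-100 score) have
# comparable ranges, so the analysis below proceeds unscaled; the sketch shows
# the usual standardisation step without overwriting X. The name X_scaled is
# illustrative and is not used further.
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X)  # zero mean, unit variance per column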
# Using the elbow method to find the optimal number of clusters
from sklearn.cluster import KMeans
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters = i, init = 'k-means++', random_state = 42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)  # inertia_ is the within-cluster sum of squares (WCSS)
plt.plot(range(1, 11), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
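# The elbow can be ambiguous to read by eye. As a complementary check (an
# addition to the original script), the silhouette score gives a second
# opinion: it is highest when clusters are compact and well separated.
from sklearn.metrics import silhouette_score
for i in range(2, 11):  # the silhouette is undefined for a single cluster
    labels = KMeans(n_clusters = i, init = 'k-means++', random_state = 42).fit_predict(X)
    print(i, silhouette_score(X, labels))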
# Fitting K-Means to the dataset
kmeans = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42)  # 5 clusters chosen from the elbow plot above
y_kmeans = kmeans.fit_predict(X)
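# A quick sanity check (illustrative addition): how many customers fell into
# each cluster, and where the centroids sit in (income, score) space.
cluster_ids, counts = np.unique(y_kmeans, return_counts = True)
print(dict(zip(cluster_ids, counts)))  # cluster sizes
print(kmeans.cluster_centers_)         # one (income, score) centroid per cluster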
# Visualising the clusters
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 100, c = 'red', label = 'Careful')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 100, c = 'blue', label = 'Standard')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 100, c = 'green', label = 'Target')
plt.scatter(X[y_kmeans == 3, 0], X[y_kmeans == 3, 1], s = 100, c = 'cyan', label = 'Impulse')
plt.scatter(X[y_kmeans == 4, 0], X[y_kmeans == 4, 1], s = 100, c = 'magenta', label = 'Sensible')
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
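# The five scatter calls above hard-code one colour and one label per cluster.
# An equivalent loop-based sketch (the colour list and generic labels are
# illustrative choices) generalises to any number of clusters:
colors = ['red', 'blue', 'green', 'cyan', 'magenta']
for cluster, color in enumerate(colors):
    plt.scatter(X[y_kmeans == cluster, 0], X[y_kmeans == cluster, 1],
                s = 100, c = color, label = 'Cluster ' + str(cluster + 1))
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s = 300, c = 'yellow', label = 'Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()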
For clustering in more than two dimensions, the visualisation code above is no longer valid, since a scatter plot can only show two axes. However, a dimensionality-reduction technique such as a self-organising map (SOM) can project the data down to two dimensions for plotting, as sketched below.
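As a sketch of that idea, the snippet below uses PCA rather than a SOM (a simpler stand-in for the same goal of projecting onto two dimensions). It assumes a three-column feature matrix built from Age, Annual Income, and Spending Score; the names X_hd, labels_hd, and X_2d are illustrative.

from sklearn.decomposition import PCA
X_hd = dataset.iloc[:, 2:5].values  # Age, Annual Income (k$), Spending Score (1-100)
labels_hd = KMeans(n_clusters = 5, init = 'k-means++', random_state = 42).fit_predict(X_hd)
X_2d = PCA(n_components = 2).fit_transform(X_hd)  # project the 3-D features onto 2 components
plt.scatter(X_2d[:, 0], X_2d[:, 1], c = labels_hd, s = 100)
plt.title('Clusters projected onto two principal components')
plt.xlabel('PC 1')
plt.ylabel('PC 2')
plt.show()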