|
1 | 1 | """ |
2 | 2 | =============================================================================== |
3 | | -Silhouette analysis for sample data clustered using KMeans clustering algorithm |
| 3 | +Selecting the number of clusters with silhouette analysis on KMeans clustering |
4 | 4 | =============================================================================== |
5 | 5 |
|
6 | 6 | Silhouette analysis can be used to study the separation distance between the |
7 | 7 | resulting clusters. The silhouette plot displays a measure of how close each |
8 | | -point in one cluster is to points in the neighboring clusters. This measure has |
9 | | -a range of [-1, 1]. Silhoette coefficients (as these values are referred to as) |
10 | | -near +1 indicate that the sample is far away from the neighboring clusters. |
11 | | -A value of 0 indicates that the sample is on or very close to the decision |
12 | | -boundary between two neighboring clusters and negative values (upto -1) |
13 | | -indicate that those samples might have been assigned to the wrong cluster. |
| 8 | +point in one cluster is to points in the neighboring clusters and thus
| 9 | +provides a way to visually assess parameters such as the number of clusters.
| 10 | +This measure has a range of [-1, 1].
| 11 | +
|
| 12 | +Silhouette coefficients (as these values are referred to) near +1 indicate
| 13 | +that the sample is far away from the neighboring clusters. A value of 0
| 14 | +indicates that the sample is on or very close to the decision boundary between
| 15 | +two neighboring clusters, and negative values indicate that those samples
| 16 | +might have been assigned to the wrong cluster.
| 17 | +
|
| 18 | +In this example, silhouette analysis is used to choose an optimal value for
| 19 | +``n_clusters``. The silhouette plot shows that ``n_clusters`` values of 3, 5
| 20 | +and 6 are bad picks for the given data, due to the presence of clusters with
| 21 | +below-average silhouette scores and also due to wide fluctuations in the size
| 22 | +of the silhouette plots. Silhouette analysis is more ambivalent in deciding
| 23 | +between 2 and 4 (a numeric check is also sketched at the end of the script).
| 24 | +
|
| 25 | +The thickness of each silhouette plot also indicates the cluster size. The
| 26 | +silhouette plot for cluster 0, when ``n_clusters`` is equal to 2, is bigger
| 27 | +in size owing to the grouping of the 3 sub-clusters into one big cluster.
| 28 | +However, when ``n_clusters`` is equal to 4, all the plots are more or less of
| 29 | +similar thickness and hence of similar sizes, as can also be verified from
| 30 | +the labeled scatter plot on the right.
14 | 31 | """ |
15 | 32 |
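Before the full example, a minimal standalone sketch (with illustrative names
such as ``X_demo``) shows what the two silhouette helpers from
``sklearn.metrics`` return: ``silhouette_score`` gives the mean coefficient
over all samples, while ``silhouette_samples`` gives one coefficient per
sample.

    from sklearn.cluster import KMeans
    from sklearn.datasets import make_blobs
    from sklearn.metrics import silhouette_samples, silhouette_score

    # Small synthetic data set and a KMeans labelling of it
    X_demo, _ = make_blobs(n_samples=60, centers=3, random_state=0)
    labels_demo = KMeans(n_clusters=3, random_state=0).fit_predict(X_demo)

    print(silhouette_score(X_demo, labels_demo))        # mean over all samples
    print(silhouette_samples(X_demo, labels_demo)[:5])  # per-sample values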
|
16 | 33 | from __future__ import print_function |
|
31 | 48 | X, y = make_blobs(n_samples=500, |
32 | 49 | n_features=2, |
33 | 50 | centers=4, |
34 | | - cluster_std=1.0, |
| 51 | + cluster_std=1, |
35 | 52 | center_box=(-10.0, 10.0), |
36 | 53 | shuffle=True, |
37 | 54 | random_state=1) # For reproducibility |
38 | 55 |
|
39 | | -range_n_clusters = [2, 4, 6] |
| 56 | +range_n_clusters = [2, 3, 4, 5, 6] |
40 | 57 |
|
41 | 58 | for n_clusters in range_n_clusters: |
42 | 59 | # Create a subplot with 1 row and 2 columns |
|
47 | 64 | # The silhouette coefficient can range from -1 to 1, but in this example all
48 | 65 | # lie within [-0.1, 1] |
49 | 66 | ax1.set_xlim([-0.1, 1]) |
50 | | - # The n_clusters*10 are the additional samples to demarcate the space |
51 | | - # between silhouette plots of individual clusters. |
52 | | - ax1.set_ylim([0, len(X) + n_clusters * 10]) |
| 67 | + # The (n_clusters+1)*10 is for inserting blank space between silhouette |
| 68 | + # plots of individual clusters, to demarcate them clearly. |
| 69 | + ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10]) |
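| | + # For instance, with the 500 samples here and n_clusters = 4, the
| | + # upper limit works out to 500 + (4 + 1) * 10 = 550.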
53 | 70 |
|
54 | 71 | # Initialize the clusterer with n_clusters value and a random generator |
55 | 72 | # seed of 10 for reproducibility. |
|
66 | 83 | # Compute the silhouette scores for each sample |
67 | 84 | sample_silhouette_values = silhouette_samples(X, cluster_labels) |
68 | 85 |
|
69 | | - # This will hold the silhouette coefficient of all the clusters separated |
70 | | - # by 0 samples. |
71 | | - sorted_clustered_sample_silhouette_values = [] |
72 | | - |
73 | | - for i in np.unique(cluster_labels): |
| 86 | + y_lower = 10 |
| 87 | + for i in range(n_clusters): |
| 88 | + # Aggregate the silhouette scores for samples belonging to |
| 89 | + # cluster i, and sort them |
74 | 90 | ith_cluster_silhouette_values = \ |
75 | 91 | sample_silhouette_values[cluster_labels == i] |
76 | 92 |
|
77 | | - # Add the ith_cluster_silhouette_values after sorting them |
78 | 93 | ith_cluster_silhouette_values.sort() |
79 | 94 |
|
80 | | - # The introduced 0 samples are to differentiate clearly between the |
81 | | - # different clusters |
82 | | - sorted_clustered_sample_silhouette_values += \ |
83 | | - ith_cluster_silhouette_values.tolist() + [0] * 10 |
| 95 | + size_cluster_i = ith_cluster_silhouette_values.shape[0] |
| 96 | + y_upper = y_lower + size_cluster_i |
84 | 97 |
|
85 | | - x_values = np.array(sorted_clustered_sample_silhouette_values) |
86 | | - y_range = np.arange(len(X) + 10 * n_clusters) |
87 | | - |
88 | | - # Computing custom label coordinates for labeling the clusters |
89 | | - # Plot the silhouette with the corresponding cluster color |
90 | | - offset = 0 |
91 | | - for i in range(n_clusters): |
92 | | - size_cluster_i = sum(cluster_labels == i) |
| 98 | + color = cm.spectral(float(i) / n_clusters) |
| 99 | + ax1.fill_betweenx(np.arange(y_lower, y_upper), |
| 100 | + 0, ith_cluster_silhouette_values, |
| 101 | + facecolor=color, edgecolor=color, alpha=0.7) |
93 | 102 |
|
94 | | - x = -0.05 |
95 | | - # Label them at the middle |
96 | | - dy = size_cluster_i |
97 | | - ax1.text(x, offset + 0.5 * dy, str(i)) |
| 103 | + # Label the silhouette plots with their cluster numbers at the middle |
| 104 | + ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i)) |
98 | 105 |
|
99 | | - y_bottom = offset |
100 | | - y_top = offset + dy |
101 | | - |
102 | | - color = cm.spectral(float(i) / n_clusters, 1) |
103 | | - ax1.fill_betweenx(y_range, 0, x_values, |
104 | | - where=((y_range >= y_bottom) & (y_range < y_top)), |
105 | | - facecolor=color, edgecolor=color) |
106 | | - # Compute the base offset for next plot |
107 | | - offset += size_cluster_i + 10 # 10 for the 0 samples |
| 106 | + # Compute the new y_lower for next plot |
| 107 | + y_lower = y_upper + 10 # leave a gap of 10 before the next cluster's plot
108 | 108 |
|
109 | 109 | ax1.set_title("The silhouette plot for the various clusters.") |
110 | 110 | ax1.set_xlabel("The silhouette coefficient values") |
111 | 111 | ax1.set_ylabel("Cluster label") |
112 | | - |
| 112 | + |
113 | 113 | # The vertical line for the average silhouette score of all the values
114 | | - ax1.axvline(x = silhouette_avg, color = "red", linestyle = "--") |
| 114 | + ax1.axvline(x=silhouette_avg, color="red", linestyle="--") |
115 | 115 |
|
116 | 116 | ax1.set_yticks([]) # Clear the yaxis labels / ticks |
117 | 117 | ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1]) |
118 | 118 |
|
119 | 119 | # 2nd Plot showing the actual clusters formed |
120 | | - for k in range(len(X)): |
121 | | - color = cm.spectral(float(cluster_labels[k]) / n_clusters, 1) |
122 | | - ax2.scatter(X[k, 0], X[k, 1], marker='.', color=color) |
123 | | - |
124 | | - # Label the cluster centers with the cluster number for identification and |
125 | | - # study of the corresponding silhouette plot. |
126 | | - for c in clusterer.cluster_centers_: |
127 | | - # Use the clusterer to know to which cluster number the current center |
128 | | - # c belongs to |
129 | | - i = clusterer.predict(c)[0] |
130 | | - ax2.scatter(c[0], c[1], marker='o', c="white", alpha=1, s=200) |
131 | | - ax2.scatter(c[0], c[1], marker="$%d$" % i, alpha=1, s=50) |
| 120 | + colors = cm.spectral(cluster_labels.astype(float) / n_clusters)
| 121 | + ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors)
| 122 | + |
| 123 | + # Labeling the clusters |
| 124 | + centers = clusterer.cluster_centers_ |
| 125 | + # Draw white circles at cluster centers |
| 126 | + ax2.scatter(centers[:, 0], centers[:, 1], |
| 127 | + marker='o', c="white", alpha=1, s=200) |
| 128 | + |
| 129 | + for i, c in enumerate(centers): |
| 130 | + ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50) |
132 | 131 |
|
133 | 132 | ax2.set_title("The visualization of the clustered data.") |
| 133 | + ax2.set_xlabel("Feature space for the 1st feature") |
| 134 | + ax2.set_ylabel("Feature space for the 2nd feature") |
| 135 | + |
134 | 136 | plt.suptitle(("Silhouette analysis for KMeans clustering on sample data " |
135 | 137 | "with n_clusters = %d" % n_clusters), |
136 | 138 | fontsize=14, fontweight='bold') |
137 | | - ax2.set_xlabel("Feature space for the 1st feature") |
138 | | - ax2.set_ylabel("Feature space for the 2nd feature") |
| 139 | + |
139 | 140 | plt.show() |
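The visual analysis above can be complemented by a purely numeric check. The
following is a minimal sketch (reusing ``X`` and ``range_n_clusters`` from the
script above; ``scores`` is a hypothetical helper dict) that simply picks the
``n_clusters`` value with the highest average silhouette score:

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    scores = {}
    for n in range_n_clusters:
        labels = KMeans(n_clusters=n, random_state=10).fit_predict(X)
        scores[n] = silhouette_score(X, labels)

    print("Highest average silhouette score at n_clusters =",
          max(scores, key=scores.get))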