Merge pull request scikit-learn#4965 from mrphilroth/4922

amueller · amueller · commit 0bcbf922e5bb · 2015-07-29T11:20:27.000-04:00
[MRG+1] New k-means example
diff --git a/doc/modules/clustering.rst b/doc/modules/clustering.rst
@@ -156,6 +156,11 @@ It suffers from various drawbacks:
   prior to k-means clustering can alleviate this problem
   and speed up the computations.
 
+.. image:: ../auto_examples/cluster/images/plot_kmeans_assumptions_001.png
+   :target: ../auto_examples/cluster/plot_kmeans_assumptions.html
+   :align: center
+   :scale: 50
+
 K-means is often referred to as Lloyd's algorithm. In basic terms, the
 algorithm has three steps. The first step chooses the initial centroids, with
 the most basic method being to choose :math:`k` samples from the dataset
@@ -213,6 +218,8 @@ transform method of a trained model of :class:`KMeans`.
 
 .. topic:: Examples:
 
+ * :ref:`example_cluster_plot_kmeans_assumptions.py`: Demonstrating when
+   k-means performs intuitively and when it does not
  * :ref:`example_cluster_plot_kmeans_digits.py`: Clustering handwritten digits
 
 .. topic:: References:
diff --git a/examples/cluster/plot_kmeans_assumptions.py b/examples/cluster/plot_kmeans_assumptions.py
@@ -0,0 +1,63 @@
+"""
+====================================
+Demonstration of k-means assumptions
+====================================
+
+This example is meant to illustrate situations where k-means will produce
+unintuitive and possibly unexpected clusters. In the first three plots, the
+input data does not conform to some implicit assumption that k-means makes and
+undesirable clusters are produced as a result. In the last plot, k-means
+returns intuitive clusters despite unevenly sized blobs.
+"""
+print(__doc__)
+
+# Author: Phil Roth <mr.phil.roth@gmail.com>
+# License: BSD 3 clause
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_blobs
+
+plt.figure(figsize=(12, 12))
+
+n_samples = 1500
+random_state = 170
+X, y = make_blobs(n_samples=n_samples, random_state=random_state)
+
+# Incorrect number of clusters
+y_pred = KMeans(n_clusters=2, random_state=random_state).fit_predict(X)
+
+plt.subplot(221)
+plt.scatter(X[:, 0], X[:, 1], c=y_pred)
+plt.title("Incorrect Number of Blobs")
+
+# Anisotropicly distributed data
+transformation = [[ 0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
+X_aniso = np.dot(X, transformation)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_aniso)
+
+plt.subplot(222)
+plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
+plt.title("Anisotropicly Distributed Blobs")
+
+# Different variance
+X_varied, y_varied = make_blobs(n_samples=n_samples,
+                                cluster_std=[1.0, 2.5, 0.5],
+                                random_state=random_state)
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_varied)
+
+plt.subplot(223)
+plt.scatter(X_varied[:, 0], X_varied[:, 1], c=y_pred)
+plt.title("Unequal Variance")
+
+# Unevenly sized blobs
+X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))
+y_pred = KMeans(n_clusters=3, random_state=random_state).fit_predict(X_filtered)
+
+plt.subplot(224)
+plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=y_pred)
+plt.title("Unevenly Sized Blobs")
+
+plt.show()