DOC: change the example to lighter dataset

GaelVaroquaux · GaelVaroquaux · commit 5d97a113e3a3 · 2013-01-20T19:48:11.000+01:00
diff --git a/examples/cluster/plot_dict_face_patches.py b/examples/cluster/plot_dict_face_patches.py
@@ -11,6 +11,12 @@
 and extract randomly 15 patches from this image. Once we have accumulated
 750 of these patches (using 50 images), we run the `partial_fit` method
 of the online KMeans object, MiniBatchKMeans.
+
+The verbose setting on the MiniBatchKMeans enables us to see that some
+clusters are reassigned during the successive calls to
+`partial_fit`. This is because the number of patches that they represent
+has become too low, and it is better to choose a random new
+cluster.
 """
 print __doc__
 
@@ -24,37 +30,38 @@
 from sklearn.cluster import MiniBatchKMeans
 from sklearn.feature_extraction.image import extract_patches_2d
 
-faces = datasets.fetch_lfw_people()
-data = faces.data
+faces = datasets.fetch_olivetti_faces()
 
 ###############################################################################
 # Learn the dictionary of images
 
 print 'Learning the dictionary... '
 rng = np.random.RandomState(0)
-kmeans = MiniBatchKMeans(n_clusters=81)
+kmeans = MiniBatchKMeans(n_clusters=81, random_state=rng, verbose=True)
 patch_size = (20, 20)
 
 buffer = []
 index = 1
 t0 = time.time()
 
-# The online learning part
-for index, img in enumerate(faces.images):
-    data = extract_patches_2d(img, patch_size,
-                                max_patches=15, random_state=rng)
-    data = np.reshape(data, (len(data), -1))
-    buffer.append(data)
-    index += 1
-    if index % 50 == 0:
-        data = np.concatenate(buffer, axis=0)
-        data -= np.mean(data, axis=0)
-        data /= np.std(data, axis=0)
-        kmeans.partial_fit(data)
-        buffer = []
-    if index % 500 == 0:
-        print 'Partial fit of %4i out of %i' % (index,
-                                                len(faces.images))
+# The online learning part: cycle over the whole dataset 6 times
+index = 0
+for _ in range(6):
+    for img in faces.images:
+        data = extract_patches_2d(img, patch_size,
+                                    max_patches=50, random_state=rng)
+        data = np.reshape(data, (len(data), -1))
+        buffer.append(data)
+        index += 1
+        if index % 10 == 0:
+            data = np.concatenate(buffer, axis=0)
+            data -= np.mean(data, axis=0)
+            data /= np.std(data, axis=0)
+            kmeans.partial_fit(data)
+            buffer = []
+        if index % 100 == 0:
+            print 'Partial fit of %4i out of %i' % (index,
+                                                    6 * len(faces.images))
 
 dt = time.time() - t0
 print 'done in %.2fs.' % dt
@@ -69,8 +76,9 @@
     pl.xticks(())
     pl.yticks(())
 
+
 pl.suptitle('Patches of faces\nTrain time %.1fs on %d patches' %
-            (dt, len(faces.images)), fontsize=16)
+            (dt, 8 * len(faces.images)), fontsize=16)
 pl.subplots_adjust(0.08, 0.02, 0.92, 0.85, 0.08, 0.23)
 
 pl.show()