Commit 4618cb3

Merge pull request scikit-learn#4653 from ssaeger/issue_4633
[MRG + 1] Added verbose flag to GMM
2 parents f10d2f4 + d2382e1 commit 4618cb3

File tree

4 files changed (+147, -18 lines)

    sklearn/mixture/dpgmm.py
    sklearn/mixture/gmm.py
    sklearn/mixture/tests/test_dpgmm.py
    sklearn/mixture/tests/test_gmm.py

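For orientation, a minimal sketch of the flag this commit adds, against the pre-0.18 sklearn.mixture.GMM API; the data and hyperparameters here are illustrative, not part of the commit:

import numpy as np
from sklearn.mixture import GMM  # deprecated mixture API this PR touches

# Two well-separated 1-D clusters, mirroring the doctest data changed below.
np.random.seed(0)
X = np.concatenate([np.random.randn(20, 1), 10 + np.random.randn(20, 1)])

# verbose=0 (the new default) is silent; verbose=1 prints each
# initialization and EM iteration; verbose=2 additionally prints the
# per-iteration change and step timings.
g = GMM(n_components=2, n_init=2, verbose=2)
g.fit(X)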

sklearn/mixture/dpgmm.py

Lines changed: 7 additions & 8 deletions
@@ -158,7 +158,7 @@ class DPGMM(GMM):
         process. Can contain any combination of 'w' for weights,
         'm' for means, and 'c' for covars. Defaults to 'wmc'.
 
-    verbose : boolean, default False
+    verbose : int, default 0
         Controls output verbosity.
 
     Attributes
@@ -198,15 +198,14 @@ class DPGMM(GMM):
     """
 
     def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
-                 random_state=None, thresh=None, tol=1e-3, verbose=False,
+                 random_state=None, thresh=None, tol=1e-3, verbose=0,
                  min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
         self.alpha = alpha
-        self.verbose = verbose
         super(DPGMM, self).__init__(n_components, covariance_type,
                                     random_state=random_state, thresh=thresh,
                                     tol=tol, min_covar=min_covar,
                                     n_iter=n_iter, params=params,
-                                    init_params=init_params)
+                                    init_params=init_params, verbose=verbose)
 
     def _get_precisions(self):
         """Return precisions as a full matrix."""
@@ -367,7 +366,7 @@ def _monitor(self, X, z, n, end=False):
         expected.
 
         Note: this is very expensive and should not be used by default."""
-        if self.verbose:
+        if self.verbose > 0:
             print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
             if end:
                 print("Cluster proportions:", self.gamma_.T[1])
@@ -653,7 +652,7 @@ class VBGMM(DPGMM):
         process. Can contain any combination of 'w' for weights,
         'm' for means, and 'c' for covars. Defaults to 'wmc'.
 
-    verbose : boolean, default False
+    verbose : int, default 0
         Controls output verbosity.
 
     Attributes
@@ -695,7 +694,7 @@ class VBGMM(DPGMM):
     """
 
     def __init__(self, n_components=1, covariance_type='diag', alpha=1.0,
-                 random_state=None, thresh=None, tol=1e-3, verbose=False,
+                 random_state=None, thresh=None, tol=1e-3, verbose=0,
                  min_covar=None, n_iter=10, params='wmc', init_params='wmc'):
         super(VBGMM, self).__init__(
             n_components, covariance_type, random_state=random_state,
@@ -779,7 +778,7 @@ def _monitor(self, X, z, n, end=False):
         expected.
 
         Note: this is very expensive and should not be used by default."""
-        if self.verbose:
+        if self.verbose > 0:
             print("Bound after updating %8s: %f" % (n, self.lower_bound(X, z)))
             if end:
                 print("Cluster proportions:", self.gamma_)

sklearn/mixture/gmm.py

Lines changed: 45 additions & 6 deletions
@@ -12,6 +12,7 @@
 import warnings
 import numpy as np
 from scipy import linalg
+from time import time
 
 from ..base import BaseEstimator
 from ..utils import check_random_state, check_array
@@ -156,6 +157,11 @@ class GMM(BaseEstimator):
         process. Can contain any combination of 'w' for weights,
         'm' for means, and 'c' for covars. Defaults to 'wmc'.
 
+    verbose : int, default: 0
+        Enable verbose output. If 1 then it always prints the current
+        initialization and iteration step. If greater than 1 then
+        it prints additionally the change and time needed for each step.
+
     Attributes
     ----------
     weights_ : array, shape (`n_components`,)
@@ -203,7 +209,7 @@ class GMM(BaseEstimator):
     >>> g.fit(obs)  # doctest: +NORMALIZE_WHITESPACE
     GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
             n_components=2, n_init=1, n_iter=100, params='wmc',
-            random_state=None, thresh=None, tol=0.001)
+            random_state=None, thresh=None, tol=0.001, verbose=0)
     >>> np.round(g.weights_, 2)
     array([ 0.75,  0.25])
     >>> np.round(g.means_, 2)
@@ -221,15 +227,16 @@ class GMM(BaseEstimator):
     >>> g.fit(20 * [[0]] + 20 * [[10]])  # doctest: +NORMALIZE_WHITESPACE
     GMM(covariance_type='diag', init_params='wmc', min_covar=0.001,
             n_components=2, n_init=1, n_iter=100, params='wmc',
-            random_state=None, thresh=None, tol=0.001)
+            random_state=None, thresh=None, tol=0.001, verbose=0)
     >>> np.round(g.weights_, 2)
     array([ 0.5,  0.5])
 
     """
 
     def __init__(self, n_components=1, covariance_type='diag',
                  random_state=None, thresh=None, tol=1e-3, min_covar=1e-3,
-                 n_iter=100, n_init=1, params='wmc', init_params='wmc'):
+                 n_iter=100, n_init=1, params='wmc', init_params='wmc',
+                 verbose=0):
         if thresh is not None:
             warnings.warn("'thresh' has been replaced by 'tol' in 0.16 "
                           " and will be removed in 0.18.",
@@ -244,6 +251,7 @@ def __init__(self, n_components=1, covariance_type='diag',
         self.n_init = n_init
         self.params = params
         self.init_params = init_params
+        self.verbose = verbose
 
         if covariance_type not in ['spherical', 'tied', 'diag', 'full']:
             raise ValueError('Invalid value for covariance_type: %s' %
@@ -458,15 +466,26 @@ def _fit(self, X, y=None, do_prediction=False):
 
         max_log_prob = -np.infty
 
-        for _ in range(self.n_init):
+        if self.verbose > 0:
+            print('Expectation-maximization algorithm started.')
+
+        for init in range(self.n_init):
+            if self.verbose > 0:
+                print('Initialization '+str(init+1))
+                start_init_time = time()
+
             if 'm' in self.init_params or not hasattr(self, 'means_'):
                 self.means_ = cluster.KMeans(
                     n_clusters=self.n_components,
                     random_state=self.random_state).fit(X).cluster_centers_
+                if self.verbose > 1:
+                    print('\tMeans have been initialized.')
 
             if 'w' in self.init_params or not hasattr(self, 'weights_'):
                 self.weights_ = np.tile(1.0 / self.n_components,
                                         self.n_components)
+                if self.verbose > 1:
+                    print('\tWeights have been initialized.')
 
             if 'c' in self.init_params or not hasattr(self, 'covars_'):
                 cv = np.cov(X.T) + self.min_covar * np.eye(X.shape[1])
@@ -475,6 +494,8 @@ def _fit(self, X, y=None, do_prediction=False):
                 self.covars_ = \
                     distribute_covar_matrix_to_match_covariance_type(
                         cv, self.covariance_type, self.n_components)
+                if self.verbose > 1:
+                    print('\tCovariance matrices have been initialized.')
 
             # EM algorithms
             current_log_likelihood = None
@@ -486,23 +507,33 @@ def _fit(self, X, y=None, do_prediction=False):
                    else self.thresh / float(X.shape[0]))
 
             for i in range(self.n_iter):
+                if self.verbose > 0:
+                    print('\tEM iteration '+str(i+1))
+                    start_iter_time = time()
                 prev_log_likelihood = current_log_likelihood
                 # Expectation step
                 log_likelihoods, responsibilities = self.score_samples(X)
                 current_log_likelihood = log_likelihoods.mean()
 
                 # Check for convergence.
-                # (should compare to self.tol when dreprecated 'thresh' is
+                # (should compare to self.tol when deprecated 'thresh' is
                 # removed in v0.18)
                 if prev_log_likelihood is not None:
                     change = abs(current_log_likelihood - prev_log_likelihood)
+                    if self.verbose > 1:
+                        print('\t\tChange: '+str(change))
                     if change < tol:
                         self.converged_ = True
+                        if self.verbose > 0:
+                            print('\t\tEM algorithm converged.')
                         break
 
                 # Maximization step
                 self._do_mstep(X, responsibilities, self.params,
                                self.min_covar)
+                if self.verbose > 1:
+                    print('\t\tEM iteration '+str(i+1)+' took {0:.5f}s'.format(
+                        time()-start_iter_time))
 
             # if the results are better, keep it
             if self.n_iter:
@@ -511,6 +542,13 @@ def _fit(self, X, y=None, do_prediction=False):
                     best_params = {'weights': self.weights_,
                                    'means': self.means_,
                                    'covars': self.covars_}
+                    if self.verbose > 1:
+                        print('\tBetter parameters were found.')
+
+            if self.verbose > 1:
+                print('\tInitialization '+str(init+1)+' took {0:.5f}s'.format(
+                    time()-start_init_time))
+
 
         # check the existence of an init param that was not subject to
         # likelihood computation issue.
@@ -661,7 +699,8 @@ def _log_multivariate_normal_density_full(X, means, covars, min_covar=1.e-7):
             cv_chol = linalg.cholesky(cv + min_covar * np.eye(n_dim),
                                       lower=True)
         except linalg.LinAlgError:
-            raise ValueError("'covars' must be symmetric, positive-definite")
+            raise ValueError("'covars' must be symmetric, "
+                             "positive-definite")
 
     cv_log_det = 2 * np.sum(np.log(np.diagonal(cv_chol)))
     cv_sol = linalg.solve_triangular(cv_chol, (X - mu).T, lower=True).T
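Tracing the print calls added above by hand, a verbose=2 fit with a single initialization emits output shaped roughly like this (tabs rendered as indentation, all numbers made up; note the converging iteration has no timing line because break fires before the M-step):

Expectation-maximization algorithm started.
Initialization 1
    Means have been initialized.
    Weights have been initialized.
    Covariance matrices have been initialized.
    EM iteration 1
        EM iteration 1 took 0.00215s
    EM iteration 2
        Change: 3.1e-05
        EM algorithm converged.
    Better parameters were found.
    Initialization 1 took 0.01342s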

sklearn/mixture/tests/test_dpgmm.py

Lines changed: 62 additions & 1 deletion
@@ -1,4 +1,5 @@
 import unittest
+import sys
 
 import nose
 
@@ -7,8 +8,9 @@
 from sklearn.mixture import DPGMM, VBGMM
 from sklearn.mixture.dpgmm import log_normalize
 from sklearn.datasets import make_blobs
-from sklearn.utils.testing import assert_array_less
+from sklearn.utils.testing import assert_array_less, assert_equal
 from sklearn.mixture.tests.test_gmm import GMMTester
+from sklearn.externals.six.moves import cStringIO as StringIO
 
 np.seterr(all='warn')
 
@@ -30,6 +32,65 @@ def test_class_weights():
     assert_array_less(dpgmm.weights_[~active], .05)
 
 
+def test_verbose_boolean():
+    # checks that the output for the verbose output is the same
+    # for the flag values '1' and 'True'
+    # simple 3 cluster dataset
+    X, y = make_blobs(random_state=1)
+    for Model in [DPGMM, VBGMM]:
+        dpgmm_bool = Model(n_components=10, random_state=1, alpha=20,
+                           n_iter=50, verbose=True)
+        dpgmm_int = Model(n_components=10, random_state=1, alpha=20,
+                          n_iter=50, verbose=1)
+
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+        try:
+            # generate output with the boolean flag
+            dpgmm_bool.fit(X)
+            verbose_output = sys.stdout
+            verbose_output.seek(0)
+            bool_output = verbose_output.readline()
+            # generate output with the int flag
+            dpgmm_int.fit(X)
+            verbose_output = sys.stdout
+            verbose_output.seek(0)
+            int_output = verbose_output.readline()
+            assert_equal(bool_output, int_output)
+        finally:
+            sys.stdout = old_stdout
+
+
+def test_verbose_first_level():
+    # simple 3 cluster dataset
+    X, y = make_blobs(random_state=1)
+    for Model in [DPGMM, VBGMM]:
+        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50,
+                      verbose=1)
+
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+        try:
+            dpgmm.fit(X)
+        finally:
+            sys.stdout = old_stdout
+
+
+def test_verbose_second_level():
+    # simple 3 cluster dataset
+    X, y = make_blobs(random_state=1)
+    for Model in [DPGMM, VBGMM]:
+        dpgmm = Model(n_components=10, random_state=1, alpha=20, n_iter=50,
+                      verbose=2)
+
+        old_stdout = sys.stdout
+        sys.stdout = StringIO()
+        try:
+            dpgmm.fit(X)
+        finally:
+            sys.stdout = old_stdout
+
+
 def test_log_normalize():
     v = np.array([0.1, 0.8, 0.01, 0.09])
     a = np.log(2 * v)
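All three new tests lean on the same stdout-redirect idiom: swap sys.stdout for an in-memory buffer, run the fit, and restore the real stream in a finally block so a failing fit cannot leave stdout hijacked. A distilled sketch of the pattern; capture_verbose_output is a hypothetical helper, not part of the test suite:

import sys
from sklearn.externals.six.moves import cStringIO as StringIO

def capture_verbose_output(estimator, X):
    # Redirect stdout, fit, and return everything the fit printed.
    # The finally clause restores stdout even if fit() raises.
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    try:
        estimator.fit(X)
        return sys.stdout.getvalue()
    finally:
        sys.stdout = old_stdout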

sklearn/mixture/tests/test_gmm.py

Lines changed: 33 additions & 3 deletions
@@ -1,5 +1,6 @@
 import unittest
 import copy
+import sys
 
 from nose.tools import assert_true
 import numpy as np
@@ -11,6 +12,7 @@
 from sklearn.utils.testing import assert_greater
 from sklearn.utils.testing import assert_raise_message
 from sklearn.metrics.cluster import adjusted_rand_score
+from sklearn.externals.six.moves import cStringIO as StringIO
 
 rng = np.random.RandomState(0)
 
@@ -108,7 +110,7 @@ def test_lmvnpdf_full():
 
 
 def test_lvmpdf_full_cv_non_positive_definite():
-    n_features, n_components, n_samples = 2, 1, 10
+    n_features, n_samples = 2, 10
     rng = np.random.RandomState(0)
     X = rng.randint(10) * rng.rand(n_samples, n_features)
     mu = np.mean(X, 0)
@@ -263,7 +265,7 @@ def test_train_1d(self, params='wmc'):
         # Train on 1-D data
        # Create a training set by sampling from the predefined distribution.
         X = rng.randn(100, 1)
-        #X.T[1:] = 0
+        # X.T[1:] = 0
         g = self.model(n_components=2, covariance_type=self.covariance_type,
                        random_state=rng, min_covar=1e-7, n_iter=5,
                        init_params=params)
@@ -371,7 +373,7 @@ def test_fit_predict():
 
     model = mixture.GMM(n_components=n_comps, n_iter=0)
     z = model.fit_predict(X)
-    assert np.all(z==0), "Quick Initialization Failed!"
+    assert np.all(z == 0), "Quick Initialization Failed!"
 
 
 def test_aic():
@@ -443,6 +445,34 @@ def test_positive_definite_covars():
         yield check_positive_definite_covars, covariance_type
 
 
+def test_verbose_first_level():
+    # Create sample data
+    X = rng.randn(30, 5)
+    X[:10] += 2
+    g = mixture.GMM(n_components=2, n_init=2, verbose=1)
+
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    try:
+        g.fit(X)
+    finally:
+        sys.stdout = old_stdout
+
+
+def test_verbose_second_level():
+    # Create sample data
+    X = rng.randn(30, 5)
+    X[:10] += 2
+    g = mixture.GMM(n_components=2, n_init=2, verbose=2)
+
+    old_stdout = sys.stdout
+    sys.stdout = StringIO()
+    try:
+        g.fit(X)
+    finally:
+        sys.stdout = old_stdout
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule()
