Merge remote branch 'origin'

GaelVaroquaux · GaelVaroquaux · commit f1692db173e8 · 2011-04-01T13:00:46.000+02:00
Conflicts:
	scikits/learn/datasets/lfw.py
	scikits/learn/datasets/tests/test_lfw.py
diff --git a/scikits/learn/base.py b/scikits/learn/base.py
@@ -145,6 +145,8 @@ def _get_params(self, deep=True):
         """
         out = dict()
         for key in self._get_param_names():
+            if not hasattr(self, key):
+                continue
             value = getattr (self, key)
             if deep and hasattr (value, '_get_params'):
                 deep_items = value._get_params().items()
diff --git a/scikits/learn/datasets/tests/test_lfw.py b/scikits/learn/datasets/tests/test_lfw.py
@@ -69,7 +69,11 @@ def setup_module():
         for i in range(n_faces):
             file_path = os.path.join(folder_name, name + '_%04d.jpg' % i)
             uniface = np_rng.randint(0, 255, size=(250, 250, 3))
-            imsave(file_path, uniface)
+            try:
+                imsave(file_path, uniface)
+            except ImportError:
+                # PIL is not properly installed: skip these tests
+                raise SkipTest
 
     # add some random file pollution to test robustness
     f = open(os.path.join(LFW_HOME, 'lfw_funneled', '.test.swp'), 'wb')
diff --git a/scikits/learn/grid_search.py b/scikits/learn/grid_search.py
@@ -5,11 +5,13 @@
 # License: BSD Style.
 
 import copy
+import time
 
 import numpy as np
 import scipy.sparse as sp
 
 from .externals.joblib import Parallel, delayed
+from .externals.joblib.logger import short_format_time
 from .cross_val import KFold, StratifiedKFold
 from .base import BaseEstimator, is_classifier, clone
 
@@ -26,7 +28,7 @@ def product(*args, **kwds):
             yield tuple(prod)
 
 
-def iter_grid(param_grid):
+class IterGrid(object):
     """Generators on the combination of the various parameter lists given
 
     Parameters
@@ -43,79 +45,85 @@ def iter_grid(param_grid):
 
     Examples
     ---------
-    >>> from scikits.learn.grid_search import iter_grid
+    >>> from scikits.learn.grid_search import IterGrid
     >>> param_grid = {'a':[1, 2], 'b':[True, False]}
-    >>> list(iter_grid(param_grid))
+    >>> list(IterGrid(param_grid))
     [{'a': 1, 'b': True}, {'a': 1, 'b': False}, {'a': 2, 'b': True}, {'a': 2, 'b': False}]
 
     """
-    if hasattr(param_grid, 'has_key'):
-        param_grid = [param_grid]
-    for p in param_grid:
-        # Always sort the keys of a dictionary, for reproducibility
-        items = sorted(p.items())
-        keys, values = zip(*items)
-        for v in product(*values):
-            params = dict(zip(keys, v))
-            yield params
-
-
-def fit_grid_point(X, y, base_clf, clf_params, cv, loss_func, score_func, iid,
-                   **fit_params):
+    def __init__(self, param_grid):
+        self.param_grid = param_grid
+
+    def __iter__(self):
+        param_grid = self.param_grid
+        if hasattr(param_grid, 'has_key'):
+            param_grid = [param_grid]
+        for p in param_grid:
+            # Always sort the keys of a dictionary, for reproducibility
+            items = sorted(p.items())
+            keys, values = zip(*items)
+            for v in product(*values):
+                params = dict(zip(keys, v))
+                yield params
+
+
+def fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func, 
+                score_func, verbose, **fit_params):
     """Run fit on one set of parameters
 
     Returns the score and the instance of the classifier
     """
+    if verbose > 1:
+        start_time = time.time()
+        msg = '%s' % (', '.join('%s=%s' % (k, v) 
+                                     for k, v in clf_params.iteritems()))
+        print "[GridSearchCV] %s %s" % (msg, (64-len(msg))*'.')
     # update parameters of the classifier after a copy of its base structure
     clf = copy.deepcopy(base_clf)
     clf._set_params(**clf_params)
 
-    score = 0.
-    n_test_samples = 0.
-    for train, test in cv:
-        if isinstance(X, list) or isinstance(X, tuple):
-            X_train = [X[i] for i, cond in enumerate(train) if cond]
-            X_test = [X[i] for i, cond in enumerate(test) if cond]
-        else:
-            if sp.issparse(X):
-                # For sparse matrices, slicing only works with indices
-                # (no masked array). Convert to CSR format for efficiency and
-                # because some sparse formats don't support row slicing.
-                X = sp.csr_matrix(X)
-                ind = np.arange(X.shape[0])
-                train = ind[train]
-                test = ind[test]
-            X_train = X[train]
-            X_test = X[test]
-        if y is not None:
-            y_test  = y[test]
-            y_train = y[train]
-        else:
-            y_test  = None
-            y_train = None
-
-        clf.fit(X_train, y_train, **fit_params)
-
-        if loss_func is not None:
-            y_pred = clf.predict(X_test)
-            this_score = -loss_func(y_test, y_pred)
-        elif score_func is not None:
-            y_pred = clf.predict(X_test)
-            this_score = score_func(y_test, y_pred)
-        else:
-            this_score = clf.score(X_test, y_test)
-        if iid:
-            if y is not None:
-                this_n_test_samples = y.shape[0]
-            else:
-                this_n_test_samples = X.shape[0]
-            this_score *= this_n_test_samples
-            n_test_samples += this_n_test_samples
-        score += this_score
-    if iid:
-        score /= n_test_samples
-
-    return score, clf
+    if isinstance(X, list) or isinstance(X, tuple):
+        X_train = [X[i] for i, cond in enumerate(train) if cond]
+        X_test = [X[i] for i, cond in enumerate(test) if cond]
+    else:
+        if sp.issparse(X):
+            # For sparse matrices, slicing only works with indices
+            # (no masked array). Convert to CSR format for efficiency and
+            # because some sparse formats don't support row slicing.
+            X = sp.csr_matrix(X)
+            ind = np.arange(X.shape[0])
+            train = ind[train]
+            test = ind[test]
+        X_train = X[train]
+        X_test = X[test]
+    if y is not None:
+        y_test  = y[test]
+        y_train = y[train]
+    else:
+        y_test  = None
+        y_train = None
+
+    clf.fit(X_train, y_train, **fit_params)
+
+    if loss_func is not None:
+        y_pred = clf.predict(X_test)
+        this_score = -loss_func(y_test, y_pred)
+    elif score_func is not None:
+        y_pred = clf.predict(X_test)
+        this_score = score_func(y_test, y_pred)
+    else:
+        this_score = clf.score(X_test, y_test)
+
+    if y is not None:
+        this_n_test_samples = y.shape[0]
+    else:
+        this_n_test_samples = X.shape[0]
+
+    if verbose > 1:
+        end_msg = "%s -%s" % (msg, 
+                                short_format_time(time.time() - start_time))
+        print "[GridSearchCV] %s %s" % ((64-len(end_msg))*'.', end_msg)
+    return this_score, clf, this_n_test_samples
 
 
 class GridSearchCV(BaseEstimator):
@@ -162,6 +170,9 @@ class GridSearchCV(BaseEstimator):
     refit: boolean
         refit the best estimator with the entire dataset
 
+    verbose: integer
+        Controls the verbosity: the higher, the more messages.
+
     Examples
     --------
     >>> from scikits.learn import svm, grid_search, datasets
@@ -170,8 +181,8 @@ class GridSearchCV(BaseEstimator):
     >>> svr = svm.SVR()
     >>> clf = grid_search.GridSearchCV(svr, parameters)
     >>> clf.fit(iris.data, iris.target) # doctest: +ELLIPSIS
-    GridSearchCV(n_jobs=1, fit_params={}, loss_func=None, refit=True, cv=None,
-           iid=True,
+    GridSearchCV(n_jobs=1, verbose=0, fit_params={}, loss_func=None, refit=True,
+           cv=None, iid=True,
            estimator=SVR(kernel='rbf', C=1.0, probability=False, ...
            ...
 
@@ -187,6 +198,7 @@ class GridSearchCV(BaseEstimator):
 
     def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
                  fit_params={}, n_jobs=1, iid=True, refit=True, cv=None,
+                 verbose=0,
                  ):
         assert hasattr(estimator, 'fit') and (hasattr(estimator, 'predict')
                         or hasattr(estimator, 'score')), (
@@ -210,6 +222,7 @@ def __init__(self, estimator, param_grid, loss_func=None, score_func=None,
         self.iid = iid
         self.refit = refit
         self.cv = cv
+        self.verbose = verbose
 
     def fit(self, X, y=None, **params):
         """Run fit with all sets of parameters
@@ -242,20 +255,38 @@ def fit(self, X, y=None, **params):
             else:
                 cv = KFold(n_samples, k=3)
 
-        grid = iter_grid(self.param_grid)
+        grid = IterGrid(self.param_grid)
         base_clf = clone(self.estimator)
-        out = Parallel(n_jobs=self.n_jobs)(
+        # XXX: Need to make use of Parallel's new pre_dispatch
+        out = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
             delayed(fit_grid_point)(
-                X, y, base_clf, clf_params, cv, self.loss_func,
-                self.score_func, self.iid, **self.fit_params)
-                    for clf_params in grid)
-
-        # Out is a list of pairs: score, estimator
-
+                X, y, base_clf, clf_params, train, test, self.loss_func,
+                self.score_func, self.verbose, **self.fit_params)
+                    for clf_params in grid for train, test in cv)
+
+        # Out is a list of triplets: (score, estimator, n_test_samples)
+        n_grid_points = len(list(grid))
+        n_fits        = len(out)
+        n_folds       = n_fits//n_grid_points
+        
+        scores = list()
+        for grid_start in range(0, n_fits, n_folds):
+            n_test_samples = 0
+            score = 0
+            for this_score, estimator, this_n_test_samples in \
+                                    out[grid_start:grid_start+n_folds]:
+                if self.iid:
+                    this_score *= this_n_test_samples
+                score += this_score
+                n_test_samples += this_n_test_samples
+            if self.iid:
+                score /= float(n_test_samples)
+            scores.append((score, estimator))
+            
         # Note: we do not use max(out) to make ties deterministic even if
         # comparison on estimator instances is not deterministic
         best_score = None
-        for score, estimator in out:
+        for score, estimator in scores:
             if best_score is None:
                 best_score = score
                 best_estimator = estimator
@@ -277,11 +308,10 @@ def fit(self, X, y=None, **params):
             self.score = best_estimator.score
 
         # Store the computed scores
-        grid = iter_grid(self.param_grid)
         # XXX: the name is too specific, it shouldn't have
         # 'grid' in it. Also, we should be retrieving/storing variance
         self.grid_points_scores_ = dict((tuple(clf_params.items()), score)
-                    for clf_params, (score, _) in zip(grid, out))
+                    for clf_params, (score, _) in zip(grid, scores))
 
         return self