Parallelize embarrassingly parallel loop in RFECV.fit

MechCoder · MechCoder · commit d13f519ba547 · 2015-12-06T20:58:06.000-05:00
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -42,7 +42,6 @@ Enhancements
      method ``decision_path`` which returns the decision path of samples in
      the tree. By `Arnaud Joly`_.
 
-
    - The random forest, extra tree and decision tree estimators now has a
      method ``decision_path`` which returns the decision path of samples in
      the tree. By `Arnaud Joly`_.
@@ -64,6 +63,9 @@ Enhancements
      (`#5251 <https://github.com/scikit-learn/scikit-learn/pull/5251>`_)
      By `Tom Dupre la Tour`_.
 
+   - Added ``n_jobs`` parameter to :class:`feature_selection.RFECV` to compute
+     the score on the test folds in parallel. By `Manoj Kumar`_.
+
 Bug fixes
 .........
 
diff --git a/sklearn/feature_selection/rfe.py b/sklearn/feature_selection/rfe.py
@@ -13,12 +13,23 @@
 from ..base import MetaEstimatorMixin
 from ..base import clone
 from ..base import is_classifier
+from ..externals.joblib import Parallel, delayed
 from ..model_selection import check_cv
 from ..model_selection._validation import _safe_split, _score
 from ..metrics.scorer import check_scoring
 from .base import SelectorMixin
 
 
+def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
+    """
+    Return the ``scores_`` of ``rfe`` fitted on one train/test fold.
+    """
+    X_train, y_train = _safe_split(estimator, X, y, train)
+    X_test, y_test = _safe_split(estimator, X, y, test, train)
+    return rfe._fit(
+        X_train, y_train, lambda estimator, features:
+        _score(estimator, X_test[:, features], y_test, scorer)).scores_
+
 class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
     """Feature ranking with recursive feature elimination.
 
@@ -296,6 +307,11 @@ class RFECV(RFE, MetaEstimatorMixin):
     verbose : int, default=0
         Controls verbosity of output.
 
+    n_jobs : int, default=1
+        Number of cores to run in parallel while fitting across folds.
+        Defaults to 1 core. If `n_jobs=-1`, then the number of jobs is
+        set to the number of cores.
+
     Attributes
     ----------
     n_features_ : int
@@ -349,12 +365,14 @@ class RFECV(RFE, MetaEstimatorMixin):
            for cancer classification using support vector machines",
            Mach. Learn., 46(1-3), 389--422, 2002.
     """
-    def __init__(self, estimator, step=1, cv=None, scoring=None, verbose=0):
+    def __init__(self, estimator, step=1, cv=None, scoring=None, verbose=0,
+                 n_jobs=1):
         self.estimator = estimator
         self.step = step
         self.cv = cv
         self.scoring = scoring
         self.verbose = verbose
+        self.n_jobs = n_jobs
 
     def fit(self, X, y):
         """Fit the RFE model and automatically tune the number of selected
@@ -377,23 +395,33 @@ def fit(self, X, y):
         scorer = check_scoring(self.estimator, scoring=self.scoring)
         n_features = X.shape[1]
         n_features_to_select = 1
+        rfe = RFE(estimator=self.estimator,
+                  n_features_to_select=n_features_to_select,
+                  step=self.step, verbose=self.verbose - 1)
+
 
-        # Determine the number of subsets of features
-        scores = []
+        # Determine the number of subsets of features by fitting across
+        # the train folds and choosing the "features_to_select" parameter
+        # that gives the least averaged error across all folds.
 
-        # Cross-validation
-        for n, (train, test) in enumerate(cv.split(X, y)):
-            X_train, y_train = _safe_split(self.estimator, X, y, train)
-            X_test, y_test = _safe_split(self.estimator, X, y, test, train)
+        # Note that joblib raises a non-picklable error for bound methods
+        # even if n_jobs is set to 1 with the default multiprocessing
+        # backend.
+        # This branching ensures that user code which sets n_jobs to 1
+        # and provides bound methods as scorers keeps working: it is
+        # not broken by the addition of the n_jobs parameter in
+        # version 0.18.
+
+        if self.n_jobs == 1:
+            parallel, func = list, _rfe_single_fit
+        else:
+            parallel, func, = Parallel(n_jobs=self.n_jobs), delayed(_rfe_single_fit)
 
-            rfe = RFE(estimator=self.estimator,
-                      n_features_to_select=n_features_to_select,
-                      step=self.step, verbose=self.verbose - 1)
+        scores = parallel(
+            func(rfe, self.estimator, X, y, train, test, scorer)
+            for train, test in cv.split(X, y))
 
-            rfe._fit(X_train, y_train, lambda estimator, features:
-                     _score(estimator, X_test[:, features], y_test, scorer))
-            scores.append(np.array(rfe.scores_[::-1]).reshape(1, -1))
-        scores = np.sum(np.concatenate(scores, 0), 0)
+        scores = np.sum(scores, axis=0)[::-1]
         # The index in 'scores' when 'n_features' features are selected
         n_feature_index = np.ceil((n_features - n_features_to_select) /
                                   float(self.step))
diff --git a/sklearn/feature_selection/tests/test_rfe.py b/sklearn/feature_selection/tests/test_rfe.py
@@ -288,3 +288,20 @@ def formula2(n_features, n_features_to_select, step):
                      formula1(n_features, n_features_to_select, step))
         assert_equal(rfecv.grid_scores_.shape[0],
                      formula2(n_features, n_features_to_select, step))
+
+
+def test_rfe_cv_n_jobs():
+    generator = check_random_state(0)
+    iris = load_iris()
+    X = np.c_[iris.data, generator.normal(size=(len(iris.data), 6))]
+    y = iris.target
+
+    rfecv = RFECV(estimator=SVC(kernel='linear'))
+    rfecv.fit(X, y)
+    rfecv_ranking = rfecv.ranking_
+    rfecv_grid_scores = rfecv.grid_scores_
+
+    rfecv.set_params(n_jobs=2)
+    rfecv.fit(X, y)
+    assert_array_almost_equal(rfecv.ranking_, rfecv_ranking)
+    assert_array_almost_equal(rfecv.grid_scores_, rfecv_grid_scores)