Skip to content

Commit d13f519

Browse files
committed
Parallelize embarrassingly parallel loop in RFECV.fit
1 parent 43e5454 commit d13f519

File tree

3 files changed

+62
-15
lines changed

3 files changed

+62
-15
lines changed

doc/whats_new.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@ Enhancements
4242
method ``decision_path`` which returns the decision path of samples in
4343
the tree. By `Arnaud Joly`_.
4444

45-
4645
- The random forest, extra tree and decision tree estimators now have a
4746
method ``decision_path`` which returns the decision path of samples in
4847
the tree. By `Arnaud Joly`_.
@@ -64,6 +63,9 @@ Enhancements
6463
(`#5251 <https://github.com/scikit-learn/scikit-learn/pull/5251>`_)
6564
By `Tom Dupre la Tour`_.
6665

66+
- Added ``n_jobs`` parameter to :class:`feature_selection.RFECV` to compute
67+
the score on the test folds in parallel. By `Manoj Kumar`_.
68+
6769
Bug fixes
6870
.........
6971

sklearn/feature_selection/rfe.py

Lines changed: 42 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,23 @@
1313
from ..base import MetaEstimatorMixin
1414
from ..base import clone
1515
from ..base import is_classifier
16+
from ..externals.joblib import Parallel, delayed
1617
from ..model_selection import check_cv
1718
from ..model_selection._validation import _safe_split, _score
1819
from ..metrics.scorer import check_scoring
1920
from .base import SelectorMixin
2021

2122

23+
def _rfe_single_fit(rfe, estimator, X, y, train, test, scorer):
    """Fit ``rfe`` on a single fold and return the scores it recorded.

    ``X`` and ``y`` are split into a training part (rows ``train``) and a
    held-out part (rows ``test``) with ``_safe_split``. ``rfe._fit`` is run
    on the training part and is handed a scoring callback; whenever the
    callback is invoked with an estimator and a feature selection, it
    scores that estimator with ``scorer`` on the held-out rows restricted
    to those features. The ``scores_`` attribute of the object returned by
    ``rfe._fit`` is returned.
    """
    X_fit, y_fit = _safe_split(estimator, X, y, train)
    X_eval, y_eval = _safe_split(estimator, X, y, test, train)

    def step_score(estimator, features):
        # Only the columns named by ``features`` are shown to the scorer.
        return _score(estimator, X_eval[:, features], y_eval, scorer)

    fitted = rfe._fit(X_fit, y_fit, step_score)
    return fitted.scores_
32+
2233
class RFE(BaseEstimator, MetaEstimatorMixin, SelectorMixin):
2334
"""Feature ranking with recursive feature elimination.
2435
@@ -296,6 +307,11 @@ class RFECV(RFE, MetaEstimatorMixin):
296307
verbose : int, default=0
297308
Controls verbosity of output.
298309
310+
n_jobs : int, default=1
311+
Number of cores to run in parallel while fitting across folds.
312+
Defaults to 1 core. If `n_jobs=-1`, then the number of jobs is set
313+
to the number of cores.
314+
299315
Attributes
300316
----------
301317
n_features_ : int
@@ -349,12 +365,14 @@ class RFECV(RFE, MetaEstimatorMixin):
349365
for cancer classification using support vector machines",
350366
Mach. Learn., 46(1-3), 389--422, 2002.
351367
"""
352-
def __init__(self, estimator, step=1, cv=None, scoring=None, verbose=0):
368+
def __init__(self, estimator, step=1, cv=None, scoring=None, verbose=0,
369+
n_jobs=1):
353370
self.estimator = estimator
354371
self.step = step
355372
self.cv = cv
356373
self.scoring = scoring
357374
self.verbose = verbose
375+
self.n_jobs = n_jobs
358376

359377
def fit(self, X, y):
360378
"""Fit the RFE model and automatically tune the number of selected
@@ -377,23 +395,33 @@ def fit(self, X, y):
377395
scorer = check_scoring(self.estimator, scoring=self.scoring)
378396
n_features = X.shape[1]
379397
n_features_to_select = 1
398+
rfe = RFE(estimator=self.estimator,
399+
n_features_to_select=n_features_to_select,
400+
step=self.step, verbose=self.verbose - 1)
401+
380402

381-
# Determine the number of subsets of features
382-
scores = []
403+
# Determine the number of subsets of features by fitting across
404+
# the train folds and choosing the "features_to_select" parameter
405+
# that gives the least averaged error across all folds.
383406

384-
# Cross-validation
385-
for n, (train, test) in enumerate(cv.split(X, y)):
386-
X_train, y_train = _safe_split(self.estimator, X, y, train)
387-
X_test, y_test = _safe_split(self.estimator, X, y, test, train)
407+
# Note that joblib raises a non-picklable error for bound methods
408+
# even if n_jobs is set to 1 with the default multiprocessing
409+
# backend.
410+
# This branching is done to
411+
# make sure that user code that sets n_jobs to 1
412+
# and provides bound methods as scorers is not broken with the
413+
# addition of n_jobs parameter in version 0.18.
414+
415+
if self.n_jobs == 1:
416+
parallel, func = list, _rfe_single_fit
417+
else:
418+
parallel, func, = Parallel(n_jobs=self.n_jobs), delayed(_rfe_single_fit)
388419

389-
rfe = RFE(estimator=self.estimator,
390-
n_features_to_select=n_features_to_select,
391-
step=self.step, verbose=self.verbose - 1)
420+
scores = parallel(
421+
func(rfe, self.estimator, X, y, train, test, scorer)
422+
for train, test in cv.split(X, y))
392423

393-
rfe._fit(X_train, y_train, lambda estimator, features:
394-
_score(estimator, X_test[:, features], y_test, scorer))
395-
scores.append(np.array(rfe.scores_[::-1]).reshape(1, -1))
396-
scores = np.sum(np.concatenate(scores, 0), 0)
424+
scores = np.sum(scores, axis=0)[::-1]
397425
# The index in 'scores' when 'n_features' features are selected
398426
n_feature_index = np.ceil((n_features - n_features_to_select) /
399427
float(self.step))

sklearn/feature_selection/tests/test_rfe.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -288,3 +288,20 @@ def formula2(n_features, n_features_to_select, step):
288288
formula1(n_features, n_features_to_select, step))
289289
assert_equal(rfecv.grid_scores_.shape[0],
290290
formula2(n_features, n_features_to_select, step))
291+
292+
293+
def test_rfe_cv_n_jobs():
    """Check that RFECV gives the same results with and without n_jobs=2."""
    rng = check_random_state(0)
    iris = load_iris()

    # Append six columns of Gaussian noise to the iris features.
    noise = rng.normal(size=(len(iris.data), 6))
    X = np.c_[iris.data, noise]
    y = iris.target

    rfecv = RFECV(estimator=SVC(kernel='linear'))

    # Reference run: n_jobs is left at its constructor default.
    rfecv.fit(X, y)
    expected_ranking = rfecv.ranking_
    expected_grid_scores = rfecv.grid_scores_

    # Refit the same estimator with two jobs and compare against the
    # reference results.
    rfecv.set_params(n_jobs=2)
    rfecv.fit(X, y)
    assert_array_almost_equal(rfecv.ranking_, expected_ranking)
    assert_array_almost_equal(rfecv.grid_scores_, expected_grid_scores)

0 commit comments

Comments
 (0)