Skip to content

Commit a7e1231

Browse files
ergamueller
authored and committed
BUG: Build random forests the same way regardless of n_jobs and add a test for this. Don't predict in parallel since the cost of copying memory in joblib outweighs the speedups for random forests. Fixes scikit-learn#1685.
1 parent 4194ffc commit a7e1231

File tree

2 files changed

+41
-34
lines changed

2 files changed

+41
-34
lines changed

sklearn/ensemble/forest.py

Lines changed: 11 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -65,12 +65,12 @@ class calls the ``fit`` method of each sub-estimator on random samples
6565

6666

6767
def _parallel_build_trees(n_trees, forest, X, y, sample_weight,
68-
sample_mask, X_argsorted, seed, verbose):
68+
sample_mask, X_argsorted, seeds, verbose):
6969
"""Private function used to build a batch of trees within a job."""
70-
random_state = check_random_state(seed)
7170
trees = []
7271

7372
for i in range(n_trees):
73+
random_state = check_random_state(seeds[i])
7474
if verbose > 1:
7575
print("building tree %d of %d" % (i + 1, n_trees))
7676
seed = random_state.randint(MAX_INT)
@@ -356,6 +356,9 @@ def fit(self, X, y, sample_weight=None):
356356
# Assign chunk of trees to jobs
357357
n_jobs, n_trees, _ = _partition_trees(self)
358358

359+
# Precalculate the random states
360+
seeds = [random_state.randint(MAX_INT, size=n_trees[i]) for i in xrange(len(n_trees))]
361+
359362
# Parallel loop
360363
all_trees = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
361364
delayed(_parallel_build_trees)(
@@ -366,7 +369,7 @@ def fit(self, X, y, sample_weight=None):
366369
sample_weight,
367370
sample_mask,
368371
X_argsorted,
369-
random_state.randint(MAX_INT),
372+
seeds[i],
370373
verbose=self.verbose)
371374
for i in range(n_jobs))
372375

@@ -563,32 +566,17 @@ def predict_proba(self, X):
563566
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
564567
X = array2d(X, dtype=DTYPE)
565568

566-
# Assign chunk of trees to jobs
567-
n_jobs, n_trees, starts = _partition_trees(self)
568-
569-
# Parallel loop
570-
all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
571-
delayed(_parallel_predict_proba)(
572-
self.estimators_[starts[i]:starts[i + 1]],
569+
# Running with n_jobs > 1 is slower
570+
proba = _parallel_predict_proba(
571+
self.estimators_,
573572
X,
574573
self.n_classes_,
575574
self.n_outputs_)
576-
for i in range(n_jobs))
577-
578-
# Reduce
579-
proba = all_proba[0]
580575

581576
if self.n_outputs_ == 1:
582-
for j in xrange(1, len(all_proba)):
583-
proba += all_proba[j]
584-
585577
proba /= self.n_estimators
586578

587579
else:
588-
for j in xrange(1, len(all_proba)):
589-
for k in xrange(self.n_outputs_):
590-
proba[k] += all_proba[j][k]
591-
592580
for k in xrange(self.n_outputs_):
593581
proba[k] /= self.n_estimators
594582

@@ -674,17 +662,8 @@ def predict(self, X):
674662
if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
675663
X = array2d(X, dtype=DTYPE)
676664

677-
# Assign chunk of trees to jobs
678-
n_jobs, n_trees, starts = _partition_trees(self)
679-
680-
# Parallel loop
681-
all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose)(
682-
delayed(_parallel_predict_regression)(
683-
self.estimators_[starts[i]:starts[i + 1]], X)
684-
for i in range(n_jobs))
685-
686-
# Reduce
687-
y_hat = sum(all_y_hat) / self.n_estimators
665+
y_hat = _parallel_predict_regression(self.estimators_, X)
666+
y_hat /= self.n_estimators
688667

689668
return y_hat
690669

sklearn/ensemble/tests/test_forest.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -396,12 +396,13 @@ def test_random_hasher():
396396
# test random forest hashing on circles dataset
397397
# make sure that it is linearly separable.
398398
# even after projected to two pca dimensions
399-
hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
399+
# Note: Not all random_states produce perfect results.
400+
hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
400401
X, y = datasets.make_circles(factor=0.5)
401402
X_transformed = hasher.fit_transform(X)
402403

403404
# test fit and transform:
404-
hasher = RandomTreesEmbedding(n_estimators=30, random_state=0)
405+
hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
405406
assert_array_equal(hasher.fit(X).transform(X).toarray(),
406407
X_transformed.toarray())
407408

@@ -415,6 +416,33 @@ def test_random_hasher():
415416
assert_equal(linear_clf.score(X_reduced, y), 1.)
416417

417418

419+
def test_parallel_train():
420+
rng = np.random.RandomState(12321)
421+
422+
X = rng.randn(100, 1000)
423+
y = rng.randint(0, 2, 100)
424+
425+
clfs = [
426+
RandomForestClassifier(n_estimators=20,
427+
n_jobs=n_jobs,
428+
random_state=12345)
429+
for n_jobs in range(1, 9)
430+
]
431+
432+
for clf in clfs:
433+
clf.fit(X, y)
434+
435+
X2 = rng.randn(100, 1000)
436+
437+
probas = []
438+
for clf in clfs:
439+
proba = clf.predict_proba(X2)
440+
probas.append(proba)
441+
442+
for proba1, proba2 in zip(probas, probas[1:]):
443+
assert np.allclose(proba1, proba2)
444+
445+
418446
if __name__ == "__main__":
419447
import nose
420448
nose.runmodule()

0 commit comments

Comments
 (0)