Pre-initialize all trees before dispatching them to parallel jobs

glouppe · glouppe · commit 36ab69262bb3 · 2014-01-07T10:24:25.000+01:00
diff --git a/sklearn/ensemble/forest.py b/sklearn/ensemble/forest.py
@@ -67,19 +67,11 @@ class calls the ``fit`` method of each sub-estimator on random samples
 MAX_INT = np.iinfo(np.int32).max
 
 
-def _parallel_build_trees(n_trees, forest, X, y,
-                          sample_weight, seeds, verbose):
+def _parallel_build_trees(trees, forest, X, y, sample_weight, verbose):
     """Private function used to build a batch of trees within a job."""
-    trees = []
-
-    for i in range(n_trees):
-        random_state = check_random_state(seeds[i])
+    for i, tree in enumerate(trees):
         if verbose > 1:
-            print("building tree %d of %d" % (i + 1, n_trees))
-        seed = random_state.randint(MAX_INT)
-
-        tree = forest._make_estimator(append=False)
-        tree.set_params(random_state=seed)
+            print("building tree %d of %d" % (i + 1, len(trees)))
 
         if forest.bootstrap:
             n_samples = X.shape[0]
@@ -88,6 +80,7 @@ def _parallel_build_trees(n_trees, forest, X, y,
             else:
                 curr_sample_weight = sample_weight.copy()
 
+            random_state = check_random_state(tree.random_state)
             indices = random_state.randint(0, n_samples, n_samples)
             sample_counts = bincount(indices, minlength=n_samples)
             curr_sample_weight *= sample_counts
@@ -103,8 +96,6 @@ def _parallel_build_trees(n_trees, forest, X, y,
                      sample_weight=sample_weight,
                      check_input=False)
 
-        trees.append(tree)
-
     return trees
 
 
@@ -264,10 +255,13 @@ def fit(self, X, y, sample_weight=None):
                              " if bootstrap=True")
 
         # Assign chunk of trees to jobs
-        n_jobs, n_trees, _ = _partition_estimators(self)
+        n_jobs, n_trees, starts = _partition_estimators(self)
+        trees = []
 
-        # Precalculate the random states
-        seeds = [random_state.randint(MAX_INT, size=i) for i in n_trees]
+        for i in range(self.n_estimators):
+            tree = self._make_estimator(append=False)
+            tree.set_params(random_state=random_state.randint(MAX_INT))
+            trees.append(tree)
 
         # Free allocated memory, if any
         self.estimators_ = None
@@ -278,12 +272,11 @@ def fit(self, X, y, sample_weight=None):
         all_trees = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                              backend="threading")(
             delayed(_parallel_build_trees)(
-                n_trees[i],
+                trees[starts[i]:starts[i + 1]],
                 self,
                 X,
                 y,
                 sample_weight,
-                seeds[i],
                 verbose=self.verbose)
             for i in range(n_jobs))
 
diff --git a/sklearn/ensemble/tests/test_forest.py b/sklearn/ensemble/tests/test_forest.py
@@ -474,9 +474,9 @@ def test_distribution():
     # Single variable with 4 values
     X = rng.randint(0, 4, size=(1000, 1))
     y = rng.rand(1000)
-    n_trees = 200
+    n_trees = 500
 
-    clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=1).fit(X, y)
+    clf = ExtraTreesRegressor(n_estimators=n_trees, random_state=42).fit(X, y)
 
     uniques = defaultdict(int)
     for tree in clf.estimators_: