
Commit 132ad99

jcusick13 authored and jnothman committed
ENH Pass original dataset to Stacking final estimator (scikit-learn#15138)
1 parent 5799643 commit 132ad99

File tree: 3 files changed, +107 -16 lines
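
A minimal usage sketch of the option this commit adds (assuming a scikit-learn build that includes it, i.e. 0.22 or later). The original features end up appended after the prediction columns, which is what the updated tests assert via X_trans[:, -4:]:

from sklearn.datasets import load_iris
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
X = scale(X)  # mirrors the tests; avoids convergence warnings

clf = StackingClassifier(
    estimators=[('lr', LogisticRegression()), ('svc', LinearSVC())],
    passthrough=True,  # new in this commit; default False keeps the old behavior
)
clf.fit(X, y)
# 3 predict_proba columns + 3 decision_function columns + 4 original features
print(clf.transform(X).shape)  # (150, 10)

Keeping the default at passthrough=False preserves the previous behavior, where the final estimator sees only the cross-validated predictions.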

doc/whats_new/v0.22.rst

Lines changed: 5 additions & 0 deletions
@@ -285,6 +285,11 @@ Changelog
   by the max of the samples with non-null weights only.
   :pr:`14294` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- |Enhancement| Adds ``passthrough`` to :class:`ensemble.StackingClassifier`
+  and :class:`ensemble.StackingRegressor`, allowing the original dataset
+  to be used when training the final estimator.
+  :pr:`15138` by :user:`Jon Cusick <jcusick13>`.
+
 :mod:`sklearn.feature_extraction`
 .................................
 

sklearn/ensemble/_stacking.py

Lines changed: 37 additions & 8 deletions
@@ -8,6 +8,7 @@
 
 import numpy as np
 from joblib import Parallel, delayed
+import scipy.sparse as sparse
 
 from ..base import clone
 from ..base import ClassifierMixin, RegressorMixin, TransformerMixin
@@ -37,22 +38,30 @@ class _BaseStacking(TransformerMixin, _BaseHeterogeneousEnsemble,
 
     @abstractmethod
     def __init__(self, estimators, final_estimator=None, cv=None,
-                 stack_method='auto', n_jobs=None, verbose=0):
+                 stack_method='auto', n_jobs=None, verbose=0,
+                 passthrough=False):
         super().__init__(estimators=estimators)
         self.final_estimator = final_estimator
         self.cv = cv
         self.stack_method = stack_method
         self.n_jobs = n_jobs
         self.verbose = verbose
+        self.passthrough = passthrough
 
     def _clone_final_estimator(self, default):
         if self.final_estimator is not None:
             self.final_estimator_ = clone(self.final_estimator)
         else:
             self.final_estimator_ = clone(default)
 
-    def _concatenate_predictions(self, predictions):
-        """Concatenate the predictions of each first layer learner.
+    def _concatenate_predictions(self, X, predictions):
+        """Concatenate the predictions of each first layer learner and
+        possibly the input dataset `X`.
+
+        If `X` is sparse and `self.passthrough` is False, the output of
+        `transform` will be dense (the predictions). If `X` is sparse
+        and `self.passthrough` is True, the output of `transform` will
+        be sparse.
 
         This helper is in charge of ensuring the predictions are 2D arrays and
         it will drop one of the probability column when using probabilities
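
The docstring addition above pins down a contract worth demonstrating: with a sparse X, transform stays dense unless passthrough is set. A quick end-to-end sketch of that rule (an illustration, not part of the commit; it presumes a build containing this change and uses LinearRegression/LinearSVR, which accept sparse input):

import scipy.sparse as sparse
from sklearn.datasets import load_diabetes
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVR

X, y = load_diabetes(return_X_y=True)
X_sp = sparse.csr_matrix(scale(X))
estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]

# passthrough=False: only the dense prediction columns come back
dense_out = StackingRegressor(estimators=estimators).fit(X_sp, y).transform(X_sp)
assert not sparse.issparse(dense_out)

# passthrough=True: X is appended, so sparsity (and format) is preserved
sparse_out = StackingRegressor(
    estimators=estimators, passthrough=True
).fit(X_sp, y).transform(X_sp)
assert sparse.issparse(sparse_out) and sparse_out.format == 'csr'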
@@ -72,7 +81,12 @@ def _concatenate_predictions(self, predictions):
                 X_meta.append(preds[:, 1:])
             else:
                 X_meta.append(preds)
-        return np.concatenate(X_meta, axis=1)
+        if self.passthrough:
+            X_meta.append(X)
+            if sparse.issparse(X):
+                return sparse.hstack(X_meta, format=X.format)
+
+        return np.hstack(X_meta)
 
     @staticmethod
     def _method_name(name, estimator, method):
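
The format-preserving branch above relies on a scipy behavior worth spelling out: scipy.sparse.hstack accepts a mix of dense arrays and sparse matrices, and the format=X.format argument coerces the result back to the input's format (without it, hstack typically returns COO). A standalone sketch with toy data, independent of the commit:

import numpy as np
import scipy.sparse as sparse

preds = np.array([[0.2], [0.9], [0.4]])              # dense meta-features
X = sparse.random(3, 5, density=0.3, format='csr')   # sparse original data

# Dense blocks are coerced to sparse; format='csr' mirrors format=X.format
X_meta = sparse.hstack([preds, X], format=X.format)
assert sparse.issparse(X_meta) and X_meta.format == 'csr'
assert X_meta.shape == (3, 6)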
@@ -165,7 +179,7 @@ def fit(self, X, y, sample_weight=None):
             if est != 'drop'
         ]
 
-        X_meta = self._concatenate_predictions(predictions)
+        X_meta = self._concatenate_predictions(X, predictions)
         if sample_weight is not None:
             try:
                 self.final_estimator_.fit(
@@ -192,7 +206,7 @@ def _transform(self, X):
             for est, meth in zip(self.estimators_, self.stack_method_)
             if est != 'drop'
         ]
-        return self._concatenate_predictions(predictions)
+        return self._concatenate_predictions(X, predictions)
 
     @if_delegate_has_method(delegate='final_estimator_')
     def predict(self, X, **predict_params):
@@ -288,6 +302,12 @@ class StackingClassifier(ClassifierMixin, _BaseStacking):
         `None` means 1 unless in a `joblib.parallel_backend` context. -1 means
         using all processors. See Glossary for more details.
 
+    passthrough : bool, default=False
+        When False, only the predictions of estimators will be used as
+        training data for `final_estimator`. When True, the
+        `final_estimator` is trained on the predictions as well as the
+        original training data.
+
     Attributes
     ----------
     estimators_ : list of estimators
@@ -344,13 +364,15 @@ class StackingClassifier(ClassifierMixin, _BaseStacking):
 
     """
     def __init__(self, estimators, final_estimator=None, cv=None,
-                 stack_method='auto', n_jobs=None, verbose=0):
+                 stack_method='auto', n_jobs=None, passthrough=False,
+                 verbose=0):
         super().__init__(
             estimators=estimators,
             final_estimator=final_estimator,
             cv=cv,
             stack_method=stack_method,
             n_jobs=n_jobs,
+            passthrough=passthrough,
             verbose=verbose
         )
 
@@ -525,6 +547,12 @@ class StackingRegressor(RegressorMixin, _BaseStacking):
         `None` means 1 unless in a `joblib.parallel_backend` context. -1 means
         using all processors. See Glossary for more details.
 
+    passthrough : bool, default=False
+        When False, only the predictions of estimators will be used as
+        training data for `final_estimator`. When True, the
+        `final_estimator` is trained on the predictions as well as the
+        original training data.
+
     Attributes
     ----------
     estimators_ : list of estimator
@@ -569,13 +597,14 @@ class StackingRegressor(RegressorMixin, _BaseStacking):
 
     """
     def __init__(self, estimators, final_estimator=None, cv=None, n_jobs=None,
-                 verbose=0):
+                 passthrough=False, verbose=0):
         super().__init__(
             estimators=estimators,
             final_estimator=final_estimator,
             cv=cv,
             stack_method="predict",
             n_jobs=n_jobs,
+            passthrough=passthrough,
             verbose=verbose
         )
 
sklearn/ensemble/tests/test_stacking.py

Lines changed: 65 additions & 8 deletions
@@ -5,6 +5,7 @@
 
 import pytest
 import numpy as np
+import scipy.sparse as sparse
 
 from sklearn.base import BaseEstimator
 from sklearn.base import ClassifierMixin
@@ -38,6 +39,7 @@
 from sklearn.model_selection import KFold
 
 from sklearn.utils._testing import assert_allclose
+from sklearn.utils._testing import assert_allclose_dense_sparse
 from sklearn.utils._testing import ignore_warnings
 from sklearn.utils.estimator_checks import check_estimator
 from sklearn.utils.estimator_checks import check_no_attributes_set_in_init
@@ -52,23 +54,28 @@
 @pytest.mark.parametrize(
     "final_estimator", [None, RandomForestClassifier(random_state=42)]
 )
-def test_stacking_classifier_iris(cv, final_estimator):
+@pytest.mark.parametrize("passthrough", [False, True])
+def test_stacking_classifier_iris(cv, final_estimator, passthrough):
     # prescale the data to avoid convergence warning without using a pipeline
     # for later assert
     X_train, X_test, y_train, y_test = train_test_split(
         scale(X_iris), y_iris, stratify=y_iris, random_state=42
     )
     estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
     clf = StackingClassifier(
-        estimators=estimators, final_estimator=final_estimator, cv=cv
+        estimators=estimators, final_estimator=final_estimator, cv=cv,
+        passthrough=passthrough
     )
     clf.fit(X_train, y_train)
     clf.predict(X_test)
     clf.predict_proba(X_test)
     assert clf.score(X_test, y_test) > 0.8
 
     X_trans = clf.transform(X_test)
-    assert X_trans.shape[1] == 6
+    expected_column_count = 10 if passthrough else 6
+    assert X_trans.shape[1] == expected_column_count
+    if passthrough:
+        assert_allclose(X_test, X_trans[:, -4:])
 
     clf.set_params(lr='drop')
     clf.fit(X_train, y_train)
@@ -79,7 +86,10 @@ def test_stacking_classifier_iris(cv, final_estimator):
     clf.decision_function(X_test)
 
     X_trans = clf.transform(X_test)
-    assert X_trans.shape[1] == 3
+    expected_column_count_drop = 7 if passthrough else 3
+    assert X_trans.shape[1] == expected_column_count_drop
+    if passthrough:
+        assert_allclose(X_test, X_trans[:, -4:])
 
 
 def test_stacking_classifier_drop_column_binary_classification():
@@ -161,15 +171,18 @@ def test_stacking_regressor_drop_estimator():
     (RandomForestRegressor(random_state=42), {}),
     (DummyRegressor(), {'return_std': True})]
 )
-def test_stacking_regressor_diabetes(cv, final_estimator, predict_params):
+@pytest.mark.parametrize("passthrough", [False, True])
+def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
+                                     passthrough):
     # prescale the data to avoid convergence warning without using a pipeline
     # for later assert
     X_train, X_test, y_train, _ = train_test_split(
         scale(X_diabetes), y_diabetes, random_state=42
     )
     estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
     reg = StackingRegressor(
-        estimators=estimators, final_estimator=final_estimator, cv=cv
+        estimators=estimators, final_estimator=final_estimator, cv=cv,
+        passthrough=passthrough
     )
     reg.fit(X_train, y_train)
     result = reg.predict(X_test, **predict_params)
@@ -178,14 +191,58 @@ def test_stacking_regressor_diabetes(cv, final_estimator, predict_params):
     assert len(result) == expected_result_length
 
     X_trans = reg.transform(X_test)
-    assert X_trans.shape[1] == 2
+    expected_column_count = 12 if passthrough else 2
+    assert X_trans.shape[1] == expected_column_count
+    if passthrough:
+        assert_allclose(X_test, X_trans[:, -10:])
 
     reg.set_params(lr='drop')
     reg.fit(X_train, y_train)
     reg.predict(X_test)
 
     X_trans = reg.transform(X_test)
-    assert X_trans.shape[1] == 1
+    expected_column_count_drop = 11 if passthrough else 1
+    assert X_trans.shape[1] == expected_column_count_drop
+    if passthrough:
+        assert_allclose(X_test, X_trans[:, -10:])
+
+
+@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
+def test_stacking_regressor_sparse_passthrough(fmt):
+    # Check passthrough behavior on a sparse X matrix
+    X_train, X_test, y_train, _ = train_test_split(
+        sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
+        y_diabetes, random_state=42
+    )
+    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
+    rf = RandomForestRegressor(n_estimators=10, random_state=42)
+    clf = StackingRegressor(
+        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
+    )
+    clf.fit(X_train, y_train)
+    X_trans = clf.transform(X_test)
+    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
+    assert sparse.issparse(X_trans)
+    assert X_test.format == X_trans.format
+
+
+@pytest.mark.parametrize('fmt', ['csc', 'csr', 'coo'])
+def test_stacking_classifier_sparse_passthrough(fmt):
+    # Check passthrough behavior on a sparse X matrix
+    X_train, X_test, y_train, _ = train_test_split(
+        sparse.coo_matrix(scale(X_iris)).asformat(fmt),
+        y_iris, random_state=42
+    )
+    estimators = [('lr', LogisticRegression()), ('svc', LinearSVC())]
+    rf = RandomForestClassifier(n_estimators=10, random_state=42)
+    clf = StackingClassifier(
+        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
+    )
+    clf.fit(X_train, y_train)
+    X_trans = clf.transform(X_test)
+    assert_allclose_dense_sparse(X_test, X_trans[:, -4:])
+    assert sparse.issparse(X_trans)
+    assert X_test.format == X_trans.format
 
 
 def test_stacking_classifier_drop_binary_prob():
