
Commit 089c8a1

[MRG] MNT requires_y tag with y=None validation (scikit-learn#16622)
1 parent f82a2cb commit 089c8a1

File tree

18 files changed: +185, -60 lines changed

doc/developers/develop.rst

Lines changed: 5 additions & 0 deletions
@@ -510,6 +510,11 @@ requires_fit (default=True)
 requires_positive_X (default=False)
     whether the estimator requires positive X.

+requires_y (default=False)
+    whether the estimator requires y to be passed to `fit`, `fit_predict` or
+    `fit_transform` methods. The tag is True for estimators inheriting from
+    `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`.
+
 requires_positive_y (default=False)
     whether the estimator requires a positive y (only applicable for regression).
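For a third-party estimator, the tag would be declared by overriding `_more_tags`, exactly as the mixins do below. A minimal sketch (`MyTargetRequiringEstimator` is a hypothetical name; `_more_tags` and `_validate_data` are private scikit-learn API as of 0.23):

    from sklearn.base import BaseEstimator

    class MyTargetRequiringEstimator(BaseEstimator):
        def _more_tags(self):
            # Declare that this estimator cannot fit without a target.
            return {'requires_y': True}

        def fit(self, X, y=None):
            # With the tag set, _validate_data raises a ValueError naming
            # the class when y is None, instead of failing obscurely later.
            X, y = self._validate_data(X, y)
            return self

    # MyTargetRequiringEstimator().fit([[1], [2]])  # ValueError: requires y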

doc/whats_new/v0.23.rst

Lines changed: 7 additions & 1 deletion
@@ -476,4 +476,10 @@ Miscellaneous
     attribute is equal to the number of features passed to the `fit` method.
     See `SLEP010
     <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html>`_
-    for details. :pr:`16112` by `Nicolas Hug`_.
+    for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_.
+
+- |API| Estimators now have a `requires_y` tag which is False by default
+  except for estimators that inherit from `~sklearn.base.RegressorMixin` or
+  `~sklearn.base.ClassifierMixin`. This tag is used to ensure that a proper
+  error message is raised when y was expected but None was passed.
+  :pr:`16622` by `Nicolas Hug`_.

sklearn/base.py

Lines changed: 32 additions & 4 deletions
@@ -36,7 +36,9 @@
     '_xfail_checks': False,
     'multioutput_only': False,
     'binary_only': False,
-    'requires_fit': True}
+    'requires_fit': True,
+    'requires_y': False,
+    }


 def clone(estimator, safe=True):

@@ -374,7 +376,8 @@ def _check_n_features(self, X, reset):
                                    self.n_features_in_)
             )

-    def _validate_data(self, X, y=None, reset=True, **check_params):
+    def _validate_data(self, X, y=None, reset=True,
+                       validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.

         Parameters

@@ -389,9 +392,14 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
             Whether to reset the `n_features_in_` attribute.
             If False, the input will be checked for consistency with data
             provided when reset was last True.
+        validate_separately : False or tuple of dicts, default=False
+            Only used if y is not None.
+            If False, call check_X_y(). Else, it must be a tuple of kwargs
+            to be used for calling check_array() on X and y respectively.
         **check_params : kwargs
             Parameters passed to :func:`sklearn.utils.check_array` or
-            :func:`sklearn.utils.check_X_y`.
+            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
+            is not False.

         Returns
         -------

@@ -400,10 +408,24 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
         """

         if y is None:
+            if self._get_tags()['requires_y']:
+                raise ValueError(
+                    f"This {self.__class__.__name__} estimator "
+                    f"requires y to be passed, but the target y is None."
+                )
             X = check_array(X, **check_params)
             out = X
         else:
-            X, y = check_X_y(X, y, **check_params)
+            if validate_separately:
+                # We need this because some estimators validate X and y
+                # separately, and in general, separately calling check_array()
+                # on X and y isn't equivalent to just calling check_X_y()
+                # :(
+                check_X_params, check_y_params = validate_separately
+                X = check_array(X, **check_X_params)
+                y = check_array(y, **check_y_params)
+            else:
+                X, y = check_X_y(X, y, **check_params)
             out = X, y

         if check_params.get('ensure_2d', True):

@@ -444,6 +466,9 @@ def score(self, X, y, sample_weight=None):
         from .metrics import accuracy_score
         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

+    def _more_tags(self):
+        return {'requires_y': True}
+

 class RegressorMixin:
     """Mixin class for all regression estimators in scikit-learn."""

@@ -494,6 +519,9 @@ def score(self, X, y, sample_weight=None):
         y_pred = self.predict(X)
         return r2_score(y, y_pred, sample_weight=sample_weight)

+    def _more_tags(self):
+        return {'requires_y': True}
+

 class ClusterMixin:
     """Mixin class for all cluster estimators in scikit-learn."""

sklearn/covariance/_empirical_covariance.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ def empirical_covariance(X, assume_centered=False):
            [0.25, 0.25, 0.25]])
     """
     X = np.asarray(X)
+
     if X.ndim == 1:
         X = np.reshape(X, (1, -1))

sklearn/cross_decomposition/_pls.py

Lines changed: 2 additions & 1 deletion
@@ -519,7 +519,8 @@ def fit_transform(self, X, y=None):
         return self.fit(X, y).transform(X, y)

     def _more_tags(self):
-        return {'poor_score': True}
+        return {'poor_score': True,
+                'requires_y': False}


 class PLSRegression(_PLS):

sklearn/ensemble/_gb.py

Lines changed: 3 additions & 3 deletions
@@ -405,15 +405,15 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         # Check input
         # Since check_array converts both X and y to the same dtype, but the
         # trees use different types for X and y, checking them separately.
-        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
-                                dtype=DTYPE)
+
+        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                                   dtype=DTYPE, multi_output=True)
         n_samples, self.n_features_ = X.shape

         sample_weight_is_none = sample_weight is None

         sample_weight = _check_sample_weight(sample_weight, X)

-        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
         y = column_or_1d(y, warn=True)
         y = self._validate_y(y, sample_weight)
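The two separate checks can collapse into one `_validate_data` call because `check_X_y` with `multi_output=True` routes y through `check_array(y, ..., dtype=None)`, so y keeps its own dtype while X alone is converted to `DTYPE` (float32) for the trees. A small illustration of that behaviour using the public `check_X_y` (as of scikit-learn 0.23):

    import numpy as np
    from sklearn.utils import check_X_y

    X = np.array([[1., 2.], [3., 4.]], dtype=np.float64)
    y = np.array([0, 1], dtype=np.int64)

    # X is converted to the requested dtype; y is validated with
    # dtype=None and keeps int64.
    X_out, y_out = check_X_y(X, y, dtype=np.float32, multi_output=True)
    print(X_out.dtype, y_out.dtype)  # float32 int64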

sklearn/feature_selection/_rfe.py

Lines changed: 3 additions & 1 deletion
@@ -340,7 +340,9 @@ def predict_log_proba(self, X):
     def _more_tags(self):
         estimator_tags = self.estimator._get_tags()
         return {'poor_score': True,
-                'allow_nan': estimator_tags.get('allow_nan', True)}
+                'allow_nan': estimator_tags.get('allow_nan', True),
+                'requires_y': True,
+                }


 class RFECV(RFE):

sklearn/feature_selection/_univariate_selection.py

Lines changed: 3 additions & 0 deletions
@@ -363,6 +363,9 @@ def fit(self, X, y):
     def _check_params(self, X, y):
         pass

+    def _more_tags(self):
+        return {'requires_y': True}
+

 ######################################################################
 # Specific filters

sklearn/linear_model/_base.py

Lines changed: 3 additions & 0 deletions
@@ -246,6 +246,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale):
         else:
             self.intercept_ = 0.

+    def _more_tags(self):
+        return {'requires_y': True}
+

 # XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
 # Maybe the n_features checking can be moved to LinearModel.

sklearn/linear_model/_coordinate_descent.py

Lines changed: 55 additions & 34 deletions
@@ -1157,8 +1157,52 @@ def fit(self, X, y):
         y : array-like of shape (n_samples,) or (n_samples, n_targets)
             Target values
         """
-        y = check_array(y, copy=False, dtype=[np.float64, np.float32],
-                        ensure_2d=False)
+        # This makes sure that there is no duplication in memory.
+        # Dealing right with copy_X is important in the following:
+        # Multiple functions touch X and subsamples of X and can induce a
+        # lot of duplication of memory
+        copy_X = self.copy_X and self.fit_intercept
+
+        check_y_params = dict(copy=False, dtype=[np.float64, np.float32],
+                              ensure_2d=False)
+        if isinstance(X, np.ndarray) or sparse.isspmatrix(X):
+            # Keep a reference to X
+            reference_to_old_X = X
+            # Let us not impose fortran ordering so far: it is
+            # not useful for the cross-validation loop and will be done
+            # by the model fitting itself
+
+            # Need to validate separately here.
+            # We can't pass multi_output=True because that would allow y to be
+            # csr. We also want to allow y to be 64 or 32 but check_X_y only
+            # allows conversion to 64.
+            check_X_params = dict(accept_sparse='csc',
+                                  dtype=[np.float64, np.float32], copy=False)
+            X, y = self._validate_data(X, y,
+                                       validate_separately=(check_X_params,
+                                                            check_y_params))
+            if sparse.isspmatrix(X):
+                if (hasattr(reference_to_old_X, "data") and
+                        not np.may_share_memory(reference_to_old_X.data, X.data)):
+                    # X is a sparse matrix and has been copied
+                    copy_X = False
+            elif not np.may_share_memory(reference_to_old_X, X):
+                # X has been copied
+                copy_X = False
+            del reference_to_old_X
+        else:
+            # Need to validate separately here.
+            # We can't pass multi_output=True because that would allow y to be
+            # csr. We also want to allow y to be 64 or 32 but check_X_y only
+            # allows conversion to 64.
+            check_X_params = dict(accept_sparse='csc',
+                                  dtype=[np.float64, np.float32], order='F',
+                                  copy=copy_X)
+            X, y = self._validate_data(X, y,
+                                       validate_separately=(check_X_params,
+                                                            check_y_params))
+            copy_X = False
+
         if y.shape[0] == 0:
             raise ValueError("y has 0 samples: %r" % y)

@@ -1191,35 +1235,6 @@ def fit(self, X, y):
         if self.selection not in ["random", "cyclic"]:
             raise ValueError("selection should be either random or cyclic.")

-        # This makes sure that there is no duplication in memory.
-        # Dealing right with copy_X is important in the following:
-        # Multiple functions touch X and subsamples of X and can induce a
-        # lot of duplication of memory
-        copy_X = self.copy_X and self.fit_intercept
-
-        if isinstance(X, np.ndarray) or sparse.isspmatrix(X):
-            # Keep a reference to X
-            reference_to_old_X = X
-            # Let us not impose fortran ordering so far: it is
-            # not useful for the cross-validation loop and will be done
-            # by the model fitting itself
-            X = self._validate_data(X, accept_sparse='csc',
-                                    dtype=[np.float64, np.float32], copy=False)
-            if sparse.isspmatrix(X):
-                if (hasattr(reference_to_old_X, "data") and
-                        not np.may_share_memory(reference_to_old_X.data, X.data)):
-                    # X is a sparse matrix and has been copied
-                    copy_X = False
-            elif not np.may_share_memory(reference_to_old_X, X):
-                # X has been copied
-                copy_X = False
-            del reference_to_old_X
-        else:
-            X = self._validate_data(X, accept_sparse='csc',
-                                    dtype=[np.float64, np.float32], order='F',
-                                    copy=copy_X)
-            copy_X = False
-
         if X.shape[0] != y.shape[0]:
             raise ValueError("X and y have inconsistent dimensions (%d != %d)"
                              % (X.shape[0], y.shape[0]))

@@ -1842,9 +1857,15 @@ def fit(self, X, y):
             To avoid memory re-allocation it is advised to allocate the
             initial data in memory directly using that format.
         """
-        X = self._validate_data(X, dtype=[np.float64, np.float32], order='F',
-                                copy=self.copy_X and self.fit_intercept)
-        y = check_array(y, dtype=X.dtype.type, ensure_2d=False)
+
+        # Need to validate separately here.
+        # We can't pass multi_output=True because that would allow y to be csr.
+        check_X_params = dict(dtype=[np.float64, np.float32], order='F',
+                              copy=self.copy_X and self.fit_intercept)
+        check_y_params = dict(ensure_2d=False)
+        X, y = self._validate_data(X, y, validate_separately=(check_X_params,
+                                                              check_y_params))
+        y = y.astype(X.dtype)

         if hasattr(self, 'l1_ratio'):
             model_str = 'ElasticNet'
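The repeated comment above is the motivation for `validate_separately`: `check_X_y` applies its `dtype` argument only to X, and neither of its code paths can validate y against `[np.float64, np.float32]` (without `multi_output` it leaves y's dtype alone apart from object to float64 with `y_numeric=True`, and with `multi_output=True` it would accept a CSR y and check it with `dtype=None`). Validating each array with its own `check_array` kwargs preserves the 32-bit path. A minimal sketch with the public `check_array`, using parameter dicts that mirror this diff:

    import numpy as np
    from sklearn.utils import check_array

    X = np.array([[1., 2.], [3., 4.]], dtype=np.float32, order='F')
    y = np.array([0.5, 1.5], dtype=np.float32)

    # Each array gets its own kwargs; float32 is in the accepted dtype
    # list, so neither input is upcast to float64.
    check_X_params = dict(accept_sparse='csc',
                          dtype=[np.float64, np.float32], order='F')
    check_y_params = dict(dtype=[np.float64, np.float32], ensure_2d=False)
    X = check_array(X, **check_X_params)
    y = check_array(y, **check_y_params)
    print(X.dtype, y.dtype)  # float32 float32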
