ENH Checks n_features_in_ in preprocessing module (scikit-learn#18577)

thomasjpfan · ogrisel · jnothman · web-flow · commit d933c20befea · 2020-10-21T18:11:44.000+02:00
Co-authored-by: Olivier Grisel <olivier.grisel@gmail.com>
Co-authored-by: Joel Nothman <joel.nothman@gmail.com>
Co-authored-by: Christian Lorentzen <lorentzen.ch@gmail.com>
Co-authored-by: Nicolas Hug <contact@nicolas-hug.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
@@ -432,8 +432,8 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = check_array(X, copy=self.copy, dtype=FLOAT_DTYPES,
-                        force_all_finite="allow-nan")
+        X = self._validate_data(X, copy=self.copy, dtype=FLOAT_DTYPES,
+                                force_all_finite="allow-nan", reset=False)
 
         X *= self.scale_
         X += self.min_
@@ -760,9 +760,10 @@ def partial_fit(self, X, y=None, sample_weight=None):
         self : object
             Fitted scaler.
         """
+        first_call = not hasattr(self, "n_samples_seen_")
         X = self._validate_data(X, accept_sparse=('csr', 'csc'),
                                 estimator=self, dtype=FLOAT_DTYPES,
-                                force_all_finite='allow-nan')
+                                force_all_finite='allow-nan', reset=first_call)
 
         if sample_weight is not None:
             sample_weight = _check_sample_weight(sample_weight, X,
@@ -1097,9 +1098,10 @@ def transform(self, X):
             Transformed array.
         """
         check_is_fitted(self)
-        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        estimator=self, dtype=FLOAT_DTYPES,
-                        force_all_finite='allow-nan')
+        X = self._validate_data(X, accept_sparse=('csr', 'csc'),
+                                copy=self.copy, reset=False,
+                                estimator=self, dtype=FLOAT_DTYPES,
+                                force_all_finite='allow-nan')
 
         if sparse.issparse(X):
             inplace_column_scale(X, 1.0 / self.scale_)
@@ -1398,9 +1400,10 @@ def transform(self, X):
             Transformed array.
         """
         check_is_fitted(self)
-        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        estimator=self, dtype=FLOAT_DTYPES,
-                        force_all_finite='allow-nan')
+        X = self._validate_data(X, accept_sparse=('csr', 'csc'),
+                                copy=self.copy, estimator=self,
+                                dtype=FLOAT_DTYPES, reset=False,
+                                force_all_finite='allow-nan')
 
         if sparse.issparse(X):
             if self.with_scaling:
@@ -1735,8 +1738,8 @@ def transform(self, X):
         """
         check_is_fitted(self)
 
-        X = check_array(X, order='F', dtype=FLOAT_DTYPES,
-                        accept_sparse=('csr', 'csc'))
+        X = self._validate_data(X, order='F', dtype=FLOAT_DTYPES, reset=False,
+                                accept_sparse=('csr', 'csc'))
 
         n_samples, n_features = X.shape
 
@@ -2038,7 +2041,7 @@ def transform(self, X, copy=None):
             Transformed array.
         """
         copy = copy if copy is not None else self.copy
-        X = check_array(X, accept_sparse='csr')
+        X = self._validate_data(X, accept_sparse='csr', reset=False)
         return normalize(X, norm=self.norm, axis=1, copy=copy)
 
     def _more_tags(self):
@@ -2195,7 +2198,11 @@ def transform(self, X, copy=None):
             Transformed array.
         """
         copy = copy if copy is not None else self.copy
-        return binarize(X, threshold=self.threshold, copy=copy)
+        # TODO: This should be refactored because binarize also calls
+        # check_array
+        X = self._validate_data(X, accept_sparse=['csr', 'csc'], copy=copy,
+                                reset=False)
+        return binarize(X, threshold=self.threshold, copy=False)
 
     def _more_tags(self):
         return {'stateless': True}
@@ -2291,7 +2298,7 @@ def transform(self, K, copy=True):
         """
         check_is_fitted(self)
 
-        K = check_array(K, copy=copy, dtype=FLOAT_DTYPES)
+        K = self._validate_data(K, copy=copy, dtype=FLOAT_DTYPES, reset=False)
 
         K_pred_cols = (np.sum(K, axis=1) /
                        self.K_fit_rows_.shape[0])[:, np.newaxis]
@@ -2689,16 +2696,7 @@ def _transform_col(self, X_col, quantiles, inverse):
     def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
                       copy=False):
         """Check inputs before fit and transform."""
-        # In theory reset should be equal to `in_fit`, but there are tests
-        # checking the input number of feature and they expect a specific
-        # string, which is not the same one raised by check_n_features. So we
-        # don't check n_features_in_ here for now (it's done with adhoc code in
-        # the estimator anyway).
-        # TODO: set reset=in_fit when addressing reset in
-        # predict/transform/etc.
-        reset = True
-
-        X = self._validate_data(X, reset=reset,
+        X = self._validate_data(X, reset=in_fit,
                                 accept_sparse='csc', copy=copy,
                                 dtype=FLOAT_DTYPES,
                                 force_all_finite='allow-nan')
@@ -2718,16 +2716,6 @@ def _check_inputs(self, X, in_fit, accept_sparse_negative=False,
 
         return X
 
-    def _check_is_fitted(self, X):
-        """Check the inputs before transforming."""
-        check_is_fitted(self)
-        # check that the dimension of X are adequate with the fitted data
-        if X.shape[1] != self.quantiles_.shape[1]:
-            raise ValueError('X does not have the same number of features as'
-                             ' the previously fitted data. Got {} instead of'
-                             ' {}.'.format(X.shape[1],
-                                           self.quantiles_.shape[1]))
-
     def _transform(self, X, inverse=False):
         """Forward and inverse transform.
 
@@ -2777,8 +2765,8 @@ def transform(self, X):
         Xt : {ndarray, sparse matrix} of shape (n_samples, n_features)
             The projected data.
         """
+        check_is_fitted(self)
         X = self._check_inputs(X, in_fit=False, copy=self.copy)
-        self._check_is_fitted(X)
 
         return self._transform(X, inverse=False)
 
@@ -2798,9 +2786,9 @@ def inverse_transform(self, X):
         Xt : {ndarray, sparse matrix} of (n_samples, n_features)
             The projected data.
         """
+        check_is_fitted(self)
         X = self._check_inputs(X, in_fit=False, accept_sparse_negative=True,
                                copy=self.copy)
-        self._check_is_fitted(X)
 
         return self._transform(X, inverse=True)
 
@@ -3262,6 +3250,10 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False,
         ----------
         X : array-like of shape (n_samples, n_features)
 
+        in_fit : bool
+            Whether or not `_check_input` is called from `fit` or other
+            methods, e.g. `predict`, `transform`, etc.
+
         check_positive : bool, default=False
             If True, check that all data is positive and non-zero (only if
             ``self.method=='box-cox'``).
@@ -3273,7 +3265,8 @@ def _check_input(self, X, in_fit, check_positive=False, check_shape=False,
             If True, check that the transformation method is valid.
         """
         X = self._validate_data(X, ensure_2d=True, dtype=FLOAT_DTYPES,
-                                copy=self.copy, force_all_finite='allow-nan')
+                                copy=self.copy, force_all_finite='allow-nan',
+                                reset=in_fit)
 
         with np.warnings.catch_warnings():
             np.warnings.filterwarnings(
diff --git a/sklearn/preprocessing/_discretization.py b/sklearn/preprocessing/_discretization.py
@@ -289,12 +289,7 @@ def transform(self, X):
 
         # check input and attribute dtypes
         dtype = (np.float64, np.float32) if self.dtype is None else self.dtype
-        Xt = check_array(X, copy=True, dtype=dtype)
-
-        n_features = self.n_bins_.shape[0]
-        if Xt.shape[1] != n_features:
-            raise ValueError("Incorrect number of features. Expecting {}, "
-                             "received {}.".format(n_features, Xt.shape[1]))
+        Xt = self._validate_data(X, copy=True, dtype=dtype, reset=False)
 
         bin_edges = self.bin_edges_
         for jj in range(Xt.shape[1]):
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
@@ -1310,12 +1310,8 @@ def test_quantile_transform_check_error():
 
     X_bad_feat = np.transpose([[0, 25, 50, 0, 0, 0, 75, 0, 0, 100],
                                [0, 0, 2.6, 4.1, 0, 0, 2.3, 0, 9.5, 0.1]])
-    err_msg = ("X does not have the same number of features as the previously"
-               " fitted " "data. Got 2 instead of 3.")
-    with pytest.raises(ValueError, match=err_msg):
-        transformer.transform(X_bad_feat)
-    err_msg = ("X does not have the same number of features "
-               "as the previously fitted data. Got 2 instead of 3.")
+    err_msg = ("X has 2 features, but QuantileTransformer is expecting "
+               "3 features as input.")
     with pytest.raises(ValueError, match=err_msg):
         transformer.inverse_transform(X_bad_feat)
 
@@ -2434,7 +2430,8 @@ def test_power_transformer_shape_exception(method):
 
     # Exceptions should be raised for arrays with different num_columns
     # than during fitting
-    wrong_shape_message = 'Input data has a different number of features'
+    wrong_shape_message = (r"X has \d+ features, but PowerTransformer is "
+                           r"expecting \d+ features")
 
     with pytest.raises(ValueError, match=wrong_shape_message):
         pt.transform(X[:, 0:1])
diff --git a/sklearn/preprocessing/tests/test_discretization.py b/sklearn/preprocessing/tests/test_discretization.py
@@ -101,14 +101,6 @@ def test_fit_transform_n_bins_array(strategy, expected):
         assert bin_edges.shape == (n_bins + 1, )
 
 
-def test_invalid_n_features():
-    est = KBinsDiscretizer(n_bins=3).fit(X)
-    bad_X = np.arange(25).reshape(5, -1)
-    err_msg = "Incorrect number of features. Expecting 4, received 5"
-    with pytest.raises(ValueError, match=err_msg):
-        est.transform(bad_X)
-
-
 @pytest.mark.parametrize('strategy', ['uniform', 'kmeans', 'quantile'])
 def test_same_min_max(strategy):
     warnings.simplefilter("always")
diff --git a/sklearn/tests/test_common.py b/sklearn/tests/test_common.py
@@ -358,7 +358,6 @@ def test_search_cv(estimator, check, request):
     'naive_bayes',
     'neighbors',
     'pipeline',
-    'preprocessing',
     'random_projection',
     'semi_supervised',
     'svm',