
Commit 089c8a1

[MRG] MNT requires_y tag with y=None validation (scikit-learn#16622)
1 parent f82a2cb commit 089c8a1

File tree

18 files changed: +185, -60 lines changed

doc/developers/develop.rst

Lines changed: 5 additions & 0 deletions
@@ -510,6 +510,11 @@ requires_fit (default=True)
 requires_positive_X (default=False)
     whether the estimator requires positive X.

+requires_y (default=False)
+    whether the estimator requires y to be passed to `fit`, `fit_predict` or
+    `fit_transform` methods. The tag is True for estimators inheriting from
+    `~sklearn.base.RegressorMixin` and `~sklearn.base.ClassifierMixin`.
+
 requires_positive_y (default=False)
     whether the estimator requires a positive y (only applicable for regression).
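For a third-party estimator, the tag would be declared by overriding `_more_tags`, exactly as the mixins do below. A minimal sketch (`MyTargetRequiringEstimator` is a hypothetical name; `_more_tags` and `_validate_data` are private scikit-learn API as of 0.23):

    from sklearn.base import BaseEstimator

    class MyTargetRequiringEstimator(BaseEstimator):
        def _more_tags(self):
            # Declare that this estimator cannot fit without a target.
            return {'requires_y': True}

        def fit(self, X, y=None):
            # With the tag set, _validate_data raises a ValueError naming
            # the class when y is None, instead of failing obscurely later.
            X, y = self._validate_data(X, y)
            return self

    # MyTargetRequiringEstimator().fit([[1], [2]])  # ValueError: requires y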

doc/whats_new/v0.23.rst

Lines changed: 7 additions & 1 deletion
@@ -476,4 +476,10 @@ Miscellaneous
     attribute is equal to the number of features passed to the `fit` method.
     See `SLEP010
     <https://scikit-learn-enhancement-proposals.readthedocs.io/en/latest/slep010/proposal.html>`_
-    for details. :pr:`16112` by `Nicolas Hug`_.
+    for details. :pr:`16112` and :pr:`16622` by `Nicolas Hug`_.
+
+- |API| Estimators now have a `requires_y` tag which is False by default
+  except for estimators that inherit from `~sklearn.base.RegressorMixin` or
+  `~sklearn.base.ClassifierMixin`. This tag is used to ensure that a proper
+  error message is raised when y was expected but None was passed.
+  :pr:`16622` by `Nicolas Hug`_.

sklearn/base.py

Lines changed: 32 additions & 4 deletions
@@ -36,7 +36,9 @@
     '_xfail_checks': False,
     'multioutput_only': False,
     'binary_only': False,
-    'requires_fit': True}
+    'requires_fit': True,
+    'requires_y': False,
+    }


 def clone(estimator, safe=True):

@@ -374,7 +376,8 @@ def _check_n_features(self, X, reset):
                                    self.n_features_in_)
             )

-    def _validate_data(self, X, y=None, reset=True, **check_params):
+    def _validate_data(self, X, y=None, reset=True,
+                       validate_separately=False, **check_params):
         """Validate input data and set or check the `n_features_in_` attribute.

         Parameters

@@ -389,9 +392,14 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
             Whether to reset the `n_features_in_` attribute.
             If False, the input will be checked for consistency with data
             provided when reset was last True.
+        validate_separately : False or tuple of dicts, default=False
+            Only used if y is not None.
+            If False, call check_X_y(). Else, it must be a tuple of kwargs
+            to be used for calling check_array() on X and y respectively.
         **check_params : kwargs
             Parameters passed to :func:`sklearn.utils.check_array` or
-            :func:`sklearn.utils.check_X_y`.
+            :func:`sklearn.utils.check_X_y`. Ignored if validate_separately
+            is not False.

         Returns
         -------

@@ -400,10 +408,24 @@ def _validate_data(self, X, y=None, reset=True, **check_params):
         """

         if y is None:
+            if self._get_tags()['requires_y']:
+                raise ValueError(
+                    f"This {self.__class__.__name__} estimator "
+                    f"requires y to be passed, but the target y is None."
+                )
             X = check_array(X, **check_params)
             out = X
         else:
-            X, y = check_X_y(X, y, **check_params)
+            if validate_separately:
+                # We need this because some estimators validate X and y
+                # separately, and in general, separately calling check_array()
+                # on X and y isn't equivalent to just calling check_X_y()
+                # :(
+                check_X_params, check_y_params = validate_separately
+                X = check_array(X, **check_X_params)
+                y = check_array(y, **check_y_params)
+            else:
+                X, y = check_X_y(X, y, **check_params)
             out = X, y

         if check_params.get('ensure_2d', True):

@@ -444,6 +466,9 @@ def score(self, X, y, sample_weight=None):
         from .metrics import accuracy_score
         return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

+    def _more_tags(self):
+        return {'requires_y': True}
+

 class RegressorMixin:
     """Mixin class for all regression estimators in scikit-learn."""

@@ -494,6 +519,9 @@ def score(self, X, y, sample_weight=None):
         y_pred = self.predict(X)
         return r2_score(y, y_pred, sample_weight=sample_weight)

+    def _more_tags(self):
+        return {'requires_y': True}
+

 class ClusterMixin:
     """Mixin class for all cluster estimators in scikit-learn."""

sklearn/covariance/_empirical_covariance.py

Lines changed: 1 addition & 0 deletions
@@ -79,6 +79,7 @@ def empirical_covariance(X, assume_centered=False):
            [0.25, 0.25, 0.25]])
     """
     X = np.asarray(X)
+
     if X.ndim == 1:
         X = np.reshape(X, (1, -1))

sklearn/cross_decomposition/_pls.py

Lines changed: 2 additions & 1 deletion
@@ -519,7 +519,8 @@ def fit_transform(self, X, y=None):
         return self.fit(X, y).transform(X, y)

     def _more_tags(self):
-        return {'poor_score': True}
+        return {'poor_score': True,
+                'requires_y': False}


 class PLSRegression(_PLS):

sklearn/ensemble/_gb.py

Lines changed: 3 additions & 3 deletions
@@ -405,15 +405,15 @@ def fit(self, X, y, sample_weight=None, monitor=None):
         # Check input
         # Since check_array converts both X and y to the same dtype, but the
         # trees use different types for X and y, checking them separately.
-        X = self._validate_data(X, accept_sparse=['csr', 'csc', 'coo'],
-                                dtype=DTYPE)
+
+        X, y = self._validate_data(X, y, accept_sparse=['csr', 'csc', 'coo'],
+                                   dtype=DTYPE, multi_output=True)
         n_samples, self.n_features_ = X.shape

         sample_weight_is_none = sample_weight is None

         sample_weight = _check_sample_weight(sample_weight, X)

-        y = check_array(y, accept_sparse='csc', ensure_2d=False, dtype=None)
         y = column_or_1d(y, warn=True)
         y = self._validate_y(y, sample_weight)
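The two separate checks can collapse into one `_validate_data` call because `check_X_y` with `multi_output=True` routes y through `check_array(y, ..., dtype=None)`, so y keeps its own dtype while X alone is converted to `DTYPE` (float32) for the trees. A small illustration of that behaviour using the public `check_X_y` (as of scikit-learn 0.23):

    import numpy as np
    from sklearn.utils import check_X_y

    X = np.array([[1., 2.], [3., 4.]], dtype=np.float64)
    y = np.array([0, 1], dtype=np.int64)

    # X is converted to the requested dtype; y is validated with
    # dtype=None and keeps int64.
    X_out, y_out = check_X_y(X, y, dtype=np.float32, multi_output=True)
    print(X_out.dtype, y_out.dtype)  # float32 int64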

sklearn/feature_selection/_rfe.py

Lines changed: 3 additions & 1 deletion
@@ -340,7 +340,9 @@ def predict_log_proba(self, X):
     def _more_tags(self):
         estimator_tags = self.estimator._get_tags()
         return {'poor_score': True,
-                'allow_nan': estimator_tags.get('allow_nan', True)}
+                'allow_nan': estimator_tags.get('allow_nan', True),
+                'requires_y': True,
+                }


 class RFECV(RFE):

sklearn/feature_selection/_univariate_selection.py

Lines changed: 3 additions & 0 deletions
@@ -363,6 +363,9 @@ def fit(self, X, y):
     def _check_params(self, X, y):
         pass

+    def _more_tags(self):
+        return {'requires_y': True}
+

 ######################################################################
 # Specific filters

sklearn/linear_model/_base.py

Lines changed: 3 additions & 0 deletions
@@ -246,6 +246,9 @@ def _set_intercept(self, X_offset, y_offset, X_scale):
         else:
             self.intercept_ = 0.

+    def _more_tags(self):
+        return {'requires_y': True}
+

 # XXX Should this derive from LinearModel? It should be a mixin, not an ABC.
 # Maybe the n_features checking can be moved to LinearModel.

sklearn/linear_model/_coordinate_descent.py

Lines changed: 55 additions & 34 deletions
@@ -1157,8 +1157,52 @@ def fit(self, X, y):
         y : array-like of shape (n_samples,) or (n_samples, n_targets)
             Target values
         """
-        y = check_array(y, copy=False, dtype=[np.float64, np.float32],
-                        ensure_2d=False)
+        # This makes sure that there is no duplication in memory.
+        # Dealing right with copy_X is important in the following:
+        # Multiple functions touch X and subsamples of X and can induce a
+        # lot of duplication of memory
+        copy_X = self.copy_X and self.fit_intercept
+
+        check_y_params = dict(copy=False, dtype=[np.float64, np.float32],
+                              ensure_2d=False)
+        if isinstance(X, np.ndarray) or sparse.isspmatrix(X):
+            # Keep a reference to X
+            reference_to_old_X = X
+            # Let us not impose fortran ordering so far: it is
+            # not useful for the cross-validation loop and will be done
+            # by the model fitting itself
+
+            # Need to validate separately here.
+            # We can't pass multi_output=True because that would allow y to be
+            # csr. We also want to allow y to be 64 or 32 but check_X_y only
+            # allows conversion to 64.
+            check_X_params = dict(accept_sparse='csc',
+                                  dtype=[np.float64, np.float32], copy=False)
+            X, y = self._validate_data(X, y,
+                                       validate_separately=(check_X_params,
+                                                            check_y_params))
+            if sparse.isspmatrix(X):
+                if (hasattr(reference_to_old_X, "data") and
+                        not np.may_share_memory(reference_to_old_X.data, X.data)):
+                    # X is a sparse matrix and has been copied
+                    copy_X = False
+            elif not np.may_share_memory(reference_to_old_X, X):
+                # X has been copied
+                copy_X = False
+            del reference_to_old_X
+        else:
+            # Need to validate separately here.
+            # We can't pass multi_output=True because that would allow y to be
+            # csr. We also want to allow y to be 64 or 32 but check_X_y only
+            # allows conversion to 64.
+            check_X_params = dict(accept_sparse='csc',
+                                  dtype=[np.float64, np.float32], order='F',
+                                  copy=copy_X)
+            X, y = self._validate_data(X, y,
+                                       validate_separately=(check_X_params,
+                                                            check_y_params))
+            copy_X = False
+
         if y.shape[0] == 0:
             raise ValueError("y has 0 samples: %r" % y)

@@ -1191,35 +1235,6 @@ def fit(self, X, y):
         if self.selection not in ["random", "cyclic"]:
             raise ValueError("selection should be either random or cyclic.")

-        # This makes sure that there is no duplication in memory.
-        # Dealing right with copy_X is important in the following:
-        # Multiple functions touch X and subsamples of X and can induce a
-        # lot of duplication of memory
-        copy_X = self.copy_X and self.fit_intercept
-
-        if isinstance(X, np.ndarray) or sparse.isspmatrix(X):
-            # Keep a reference to X
-            reference_to_old_X = X
-            # Let us not impose fortran ordering so far: it is
-            # not useful for the cross-validation loop and will be done
-            # by the model fitting itself
-            X = self._validate_data(X, accept_sparse='csc',
-                                    dtype=[np.float64, np.float32], copy=False)
-            if sparse.isspmatrix(X):
-                if (hasattr(reference_to_old_X, "data") and
-                        not np.may_share_memory(reference_to_old_X.data, X.data)):
-                    # X is a sparse matrix and has been copied
-                    copy_X = False
-            elif not np.may_share_memory(reference_to_old_X, X):
-                # X has been copied
-                copy_X = False
-            del reference_to_old_X
-        else:
-            X = self._validate_data(X, accept_sparse='csc',
-                                    dtype=[np.float64, np.float32], order='F',
-                                    copy=copy_X)
-            copy_X = False
-
         if X.shape[0] != y.shape[0]:
             raise ValueError("X and y have inconsistent dimensions (%d != %d)"
                              % (X.shape[0], y.shape[0]))

@@ -1842,9 +1857,15 @@ def fit(self, X, y):
             To avoid memory re-allocation it is advised to allocate the
             initial data in memory directly using that format.
         """
-        X = self._validate_data(X, dtype=[np.float64, np.float32], order='F',
-                                copy=self.copy_X and self.fit_intercept)
-        y = check_array(y, dtype=X.dtype.type, ensure_2d=False)
+
+        # Need to validate separately here.
+        # We can't pass multi_output=True because that would allow y to be csr.
+        check_X_params = dict(dtype=[np.float64, np.float32], order='F',
+                              copy=self.copy_X and self.fit_intercept)
+        check_y_params = dict(ensure_2d=False)
+        X, y = self._validate_data(X, y, validate_separately=(check_X_params,
+                                                              check_y_params))
+        y = y.astype(X.dtype)

         if hasattr(self, 'l1_ratio'):
             model_str = 'ElasticNet'
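The repeated comment above is the motivation for `validate_separately`: `check_X_y` applies its `dtype` argument only to X, and neither of its code paths can validate y against `[np.float64, np.float32]` (without `multi_output` it leaves y's dtype alone apart from object to float64 with `y_numeric=True`, and with `multi_output=True` it would accept a CSR y and check it with `dtype=None`). Validating each array with its own `check_array` kwargs preserves the 32-bit path. A minimal sketch with the public `check_array`, using parameter dicts that mirror this diff:

    import numpy as np
    from sklearn.utils import check_array

    X = np.array([[1., 2.], [3., 4.]], dtype=np.float32, order='F')
    y = np.array([0.5, 1.5], dtype=np.float32)

    # Each array gets its own kwargs; float32 is in the accepted dtype
    # list, so neither input is upcast to float64.
    check_X_params = dict(accept_sparse='csc',
                          dtype=[np.float64, np.float32], order='F')
    check_y_params = dict(dtype=[np.float64, np.float32], ensure_2d=False)
    X = check_array(X, **check_X_params)
    y = check_array(y, **check_y_params)
    print(X.dtype, y.dtype)  # float32 float32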
