Commit 5718466

[MRG] ENH: Ignore NaNs in StandardScaler and scale (scikit-learn#11206)

glemaitre authored and ogrisel committed

1 parent fdb3c7c commit 5718466

File tree: 11 files changed, +328 -151 lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions

@@ -230,6 +230,10 @@ Preprocessing
   :issue:`10404` and :issue:`11243` by :user:`Lucija Gregov <LucijaGregov>` and
   :user:`Guillaume Lemaitre <glemaitre>`.
 
+- :class:`preprocessing.StandardScaler` and :func:`preprocessing.scale`
+  ignore and pass-through NaN values.
+  :issue:`11206` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Model evaluation and meta-estimators
 
 - A scorer based on :func:`metrics.brier_score_loss` is also available.
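
As a quick illustration of the entry above (a minimal sketch against this branch; data and variable names are local to the example): statistics are computed over the non-missing values only, and NaNs come out of the transformation untouched.

    >>> import numpy as np
    >>> from sklearn.preprocessing import scale
    >>> X = np.array([[1., np.nan], [3., 4.], [5., 6.]])
    >>> Xt = scale(X)
    >>> bool(np.isnan(Xt[0, 1]))   # the NaN is passed through
    True
    >>> bool(np.allclose(np.nanmean(Xt, axis=0), 0.))   # NaNs ignored in the stats
    True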

sklearn/decomposition/incremental_pca.py

Lines changed: 4 additions & 3 deletions

@@ -243,9 +243,10 @@ def partial_fit(self, X, y=None, check_input=True):
 
         # Update stats - they are 0 if this is the fisrt step
         col_mean, col_var, n_total_samples = \
-            _incremental_mean_and_var(X, last_mean=self.mean_,
-                                      last_variance=self.var_,
-                                      last_sample_count=self.n_samples_seen_)
+            _incremental_mean_and_var(
+                X, last_mean=self.mean_, last_variance=self.var_,
+                last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]))
+        n_total_samples = n_total_samples[0]
 
         # Whitening
         if self.n_samples_seen_ == 0:
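
`_incremental_mean_and_var` now expects a per-feature sample count (see sklearn/utils/extmath.py below), so IncrementalPCA, which never sees missing values, broadcasts its scalar count before the call and collapses the returned vector back to a scalar. A standalone sketch of that round trip (plain NumPy; names are local to this example):

    import numpy as np

    n_samples_seen = 10   # scalar count kept by IncrementalPCA
    n_features = 3

    # broadcast to the shape (n_features,) now required by the helper
    last_sample_count = np.repeat(n_samples_seen, n_features)
    # -> array([10, 10, 10])

    # with no NaNs every feature sees the same count, so taking the first
    # entry recovers the scalar, as the new `n_total_samples[0]` line does
    n_total_samples = last_sample_count + 5
    n_total_samples = n_total_samples[0]   # -> 15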

sklearn/preprocessing/data.py

Lines changed: 61 additions & 25 deletions

@@ -126,6 +126,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 
     To avoid memory copy the caller should pass a CSC matrix.
 
+    NaNs are treated as missing values: disregarded to compute the statistics,
+    and maintained during the data transformation.
+
     For a comparison of the different scalers, transformers, and normalizers,
     see :ref:`examples/preprocessing/plot_all_scaling.py
     <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
@@ -138,7 +141,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     """  # noqa
     X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
                     warn_on_dtype=True, estimator='the scale function',
-                    dtype=FLOAT_DTYPES)
+                    dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
@@ -154,15 +157,15 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     else:
         X = np.asarray(X)
         if with_mean:
-            mean_ = np.mean(X, axis)
+            mean_ = np.nanmean(X, axis)
         if with_std:
-            scale_ = np.std(X, axis)
+            scale_ = np.nanstd(X, axis)
         # Xr is a view on the original array that enables easy use of
         # broadcasting on the axis in which we are interested in
         Xr = np.rollaxis(X, axis)
         if with_mean:
             Xr -= mean_
-            mean_1 = Xr.mean(axis=0)
+            mean_1 = np.nanmean(Xr, axis=0)
             # Verify that mean_1 is 'close to zero'. If X contains very
             # large values, mean_1 can also be very large, due to a lack of
             # precision of mean_. In this case, a pre-scaling of the
@@ -179,7 +182,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
             scale_ = _handle_zeros_in_scale(scale_, copy=False)
             Xr /= scale_
             if with_mean:
-                mean_2 = Xr.mean(axis=0)
+                mean_2 = np.nanmean(Xr, axis=0)
                 # If mean_2 is not 'close to zero', it comes from the fact that
                 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                 # if mean_1 was close to zero. The problem is thus essentially
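
The dense path above simply swaps np.mean/np.std for their NaN-aware counterparts; the centering, scaling, and the mean_1/mean_2 precision checks are unchanged. For contrast (standalone NumPy, illustrative values):

    import numpy as np

    x = np.array([1., np.nan, 3.])
    np.mean(x)     # nan -- a single missing value poisons the statistic
    np.std(x)      # nan
    np.nanmean(x)  # 2.0 -- computed over the two non-missing values
    np.nanstd(x)   # 1.0
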
@@ -520,27 +523,31 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    scale_ : ndarray, shape (n_features,)
-        Per feature relative scaling of the data.
+    scale_ : ndarray or None, shape (n_features,)
+        Per feature relative scaling of the data. Equal to ``None`` when
+        ``with_std=False``.
 
         .. versionadded:: 0.17
            *scale_*
 
-    mean_ : array of floats with shape [n_features]
+    mean_ : ndarray or None, shape (n_features,)
         The mean value for each feature in the training set.
+        Equal to ``None`` when ``with_mean=False``.
 
-    var_ : array of floats with shape [n_features]
+    var_ : ndarray or None, shape (n_features,)
         The variance for each feature in the training set. Used to compute
-        `scale_`
+        `scale_`. Equal to ``None`` when ``with_std=False``.
 
-    n_samples_seen_ : int
-        The number of samples processed by the estimator. Will be reset on
-        new calls to fit, but increments across ``partial_fit`` calls.
+    n_samples_seen_ : int or array, shape (n_features,)
+        The number of samples processed by the estimator for each feature.
+        If there are not missing samples, the ``n_samples_seen`` will be an
+        integer, otherwise it will be an array.
+        Will be reset on new calls to fit, but increments across
+        ``partial_fit`` calls.
 
     Examples
     --------
     >>> from sklearn.preprocessing import StandardScaler
-    >>>
     >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
     >>> scaler = StandardScaler()
     >>> print(scaler.fit(data))
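
The int-or-array contract documented above is observable right after fit. A minimal sketch, assuming this branch's behavior (data is illustrative):

    >>> import numpy as np
    >>> from sklearn.preprocessing import StandardScaler
    >>> X = np.array([[1., 2.], [3., np.nan], [5., 6.]])
    >>> StandardScaler().fit(X[:, :1]).n_samples_seen_  # no NaN: plain integer
    3
    >>> StandardScaler().fit(X).n_samples_seen_  # NaN present: per-feature array
    array([3, 2])
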
@@ -564,6 +571,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
     For a comparison of the different scalers, transformers, and normalizers,
     see :ref:`examples/preprocessing/plot_all_scaling.py
     <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
@@ -626,22 +636,41 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
+                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
         # See incr_mean_variance_axis and _incremental_mean_variance_axis
 
+        # if n_samples_seen_ is an integer (i.e. no missing values), we need to
+        # transform it to a NumPy array of shape (n_features,) required by
+        # incr_mean_variance_axis and _incremental_variance_axis
+        if (hasattr(self, 'n_samples_seen_') and
+                isinstance(self.n_samples_seen_, (int, np.integer))):
+            self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
+                                             X.shape[1]).astype(np.int64)
+
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
                     "Cannot center sparse matrices: pass `with_mean=False` "
                     "instead. See docstring for motivation and alternatives.")
+
+            sparse_constructor = (sparse.csr_matrix
+                                  if X.format == 'csr' else sparse.csc_matrix)
+            counts_nan = sparse_constructor(
+                (np.isnan(X.data), X.indices, X.indptr),
+                shape=X.shape).sum(axis=0).A.ravel()
+
+            if not hasattr(self, 'n_samples_seen_'):
+                self.n_samples_seen_ = (X.shape[0] -
+                                        counts_nan).astype(np.int64)
+
             if self.with_std:
                 # First pass
-                if not hasattr(self, 'n_samples_seen_'):
+                if not hasattr(self, 'scale_'):
                     self.mean_, self.var_ = mean_variance_axis(X, axis=0)
-                    self.n_samples_seen_ = X.shape[0]
                 # Next passes
                 else:
                     self.mean_, self.var_, self.n_samples_seen_ = \
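
The counts_nan expression above reuses the sparse matrix's own structure: rebuilding a matrix from np.isnan(X.data) with the same indices/indptr yields a boolean matrix that is True exactly where a stored entry is NaN (implicit zeros can never be NaN), and a column sum then counts the missing values per feature. The same trick as a standalone sketch (SciPy/NumPy only; data is illustrative):

    import numpy as np
    from scipy import sparse

    X = sparse.csr_matrix(np.array([[0., np.nan, 3.],
                                    [4., 5., np.nan]]))
    # boolean matrix sharing X's sparsity structure, True at stored NaNs
    counts_nan = sparse.csr_matrix(
        (np.isnan(X.data), X.indices, X.indptr),
        shape=X.shape).sum(axis=0).A.ravel()
    print(counts_nan)   # [0 1 1]
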
@@ -652,15 +681,15 @@ def partial_fit(self, X, y=None):
             else:
                 self.mean_ = None
                 self.var_ = None
-                if not hasattr(self, 'n_samples_seen_'):
-                    self.n_samples_seen_ = X.shape[0]
-                else:
-                    self.n_samples_seen_ += X.shape[0]
+                if hasattr(self, 'scale_'):
+                    self.n_samples_seen_ += X.shape[0] - counts_nan
         else:
-            # First pass
             if not hasattr(self, 'n_samples_seen_'):
+                self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)
+
+            # First pass
+            if not hasattr(self, 'scale_'):
                 self.mean_ = .0
-                self.n_samples_seen_ = 0
                 if self.with_std:
                     self.var_ = .0
                 else:
@@ -669,12 +698,18 @@ def partial_fit(self, X, y=None):
             if not self.with_mean and not self.with_std:
                 self.mean_ = None
                 self.var_ = None
-                self.n_samples_seen_ += X.shape[0]
+                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     _incremental_mean_and_var(X, self.mean_, self.var_,
                                               self.n_samples_seen_)
 
+        # for backward-compatibility, reduce n_samples_seen_ to an integer
+        # if the number of samples is the same for each feature (i.e. no
+        # missing values)
+        if np.ptp(self.n_samples_seen_) == 0:
+            self.n_samples_seen_ = self.n_samples_seen_[0]
+
         if self.with_std:
             self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
         else:
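
In the backward-compatibility reduction above, np.ptp ("peak to peak", max minus min) is zero exactly when every feature saw the same number of samples, i.e. the no-missing-values case, so the array can safely collapse back to the historical integer form. Minimal sketch (plain NumPy):

    import numpy as np

    n_samples_seen = np.array([10, 10, 10])
    if np.ptp(n_samples_seen) == 0:          # max - min == 0: all counts equal
        n_samples_seen = n_samples_seen[0]   # back to a plain integer
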
@@ -704,7 +739,8 @@ def transform(self, X, y='deprecated', copy=None):
 
         copy = copy if copy is not None else self.copy
         X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
-                        estimator=self, dtype=FLOAT_DTYPES)
+                        estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         if sparse.issparse(X):
             if self.with_mean:

sklearn/preprocessing/tests/test_common.py

Lines changed: 5 additions & 1 deletion

@@ -9,9 +9,11 @@
 from sklearn.base import clone
 
 from sklearn.preprocessing import minmax_scale
+from sklearn.preprocessing import scale
 from sklearn.preprocessing import quantile_transform
 
 from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import QuantileTransformer
 
 from sklearn.utils.testing import assert_array_equal
@@ -28,6 +30,8 @@ def _get_valid_samples_by_column(X, col):
 @pytest.mark.parametrize(
     "est, func, support_sparse",
     [(MinMaxScaler(), minmax_scale, False),
+     (StandardScaler(), scale, False),
+     (StandardScaler(with_mean=False), scale, True),
      (QuantileTransformer(n_quantiles=10), quantile_transform, True)]
 )
 def test_missing_value_handling(est, func, support_sparse):
@@ -66,7 +70,7 @@ def test_missing_value_handling(est, func, support_sparse):
         est.fit(_get_valid_samples_by_column(X_train, i))
         # check transforming with NaN works even when training without NaN
         Xt_col = est.transform(X_test[:, [i]])
-        assert_array_equal(Xt_col, Xt[:, [i]])
+        assert_allclose(Xt_col, Xt[:, [i]])
         # check non-NaN is handled as before - the 1st column is all nan
         if not np.isnan(X_test[:, i]).all():
             Xt_col_nonan = est.transform(

sklearn/preprocessing/tests/test_data.py

Lines changed: 26 additions & 8 deletions

@@ -33,6 +33,7 @@
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
+from sklearn.utils.testing import assert_allclose_dense_sparse
 from sklearn.utils.testing import skip_if_32bit
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
@@ -699,6 +700,28 @@ def test_scaler_without_centering():
     assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
 
 
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+@pytest.mark.parametrize("array_constructor",
+                         [np.asarray, sparse.csc_matrix, sparse.csr_matrix])
+def test_scaler_n_samples_seen_with_nan(with_mean, with_std,
+                                        array_constructor):
+    X = np.array([[0, 1, 3],
+                  [np.nan, 6, 10],
+                  [5, 4, np.nan],
+                  [8, 0, np.nan]],
+                 dtype=np.float64)
+    X = array_constructor(X)
+
+    if sparse.issparse(X) and with_mean:
+        pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
+
+    transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
+    transformer.fit(X)
+
+    assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
+
+
 def _check_identity_scalers_attributes(scaler_1, scaler_2):
     assert scaler_1.mean_ is scaler_2.mean_ is None
     assert scaler_1.var_ is scaler_2.var_ is None
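
The array asserted in test_scaler_n_samples_seen_with_nan above follows directly from counting non-missing entries per column: column 0 has one NaN (3 valid samples), column 1 has none (4), and column 2 has two (2). Standalone check (plain NumPy):

    import numpy as np

    X = np.array([[0, 1, 3],
                  [np.nan, 6, 10],
                  [5, 4, np.nan],
                  [8, 0, np.nan]], dtype=np.float64)
    print(np.sum(~np.isnan(X), axis=0))   # [3 4 2]
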
@@ -725,8 +748,8 @@ def test_scaler_return_identity():
     transformer_csc = clone(transformer_dense)
     X_trans_csc = transformer_csc.fit_transform(X_csc)
 
-    assert_allclose(X_trans_csr.toarray(), X_csr.toarray())
-    assert_allclose(X_trans_csc.toarray(), X_csc.toarray())
+    assert_allclose_dense_sparse(X_trans_csr, X_csr)
+    assert_allclose_dense_sparse(X_trans_csc, X_csc)
     assert_allclose(X_trans_dense, X_dense)
 
     for trans_1, trans_2 in itertools.combinations([transformer_dense,
@@ -877,14 +900,9 @@ def test_scale_sparse_with_mean_raise_exception():
 
 def test_scale_input_finiteness_validation():
     # Check if non finite inputs raise ValueError
-    X = [[np.nan, 5, 6, 7, 8]]
-    assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
-                        scale, X)
-
     X = [[np.inf, 5, 6, 7, 8]]
     assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
+                        "Input contains infinity or a value too large",
                         scale, X)
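
With force_all_finite='allow-nan', input validation now rejects only infinity, which is why the NaN case is deleted from the test above. A sketch of the expected behavior on this branch (illustrative data):

    import numpy as np
    from sklearn.preprocessing import scale

    X = [[np.nan, 5., 6., 7., 8.],
         [2., 5., 6., 7., 8.]]
    scale(X)  # no longer raises: the NaN is ignored and passed through

    try:
        scale([[np.inf, 5., 6., 7., 8.]])
    except ValueError as exc:
        print(exc)  # mentions "Input contains infinity or a value too large"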

sklearn/utils/estimator_checks.py

Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@
                 'RandomForestRegressor', 'Ridge', 'RidgeCV']
 
 ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer',
-             'MinMaxScaler', 'QuantileTransformer']
+             'MinMaxScaler', 'StandardScaler', 'QuantileTransformer']
 
 
 def _yield_non_meta_checks(name, estimator):

sklearn/utils/extmath.py

Lines changed: 17 additions & 13 deletions

@@ -647,8 +647,7 @@ def make_nonnegative(X, min_value=0):
     return X
 
 
-def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
-                              last_sample_count=0):
+def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
     """Calculate mean update and a Youngs and Cramer variance update.
 
     last_mean and last_variance are statistics computed at the last step by the
@@ -669,7 +668,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
 
     last_variance : array-like, shape: (n_features,)
 
-    last_sample_count : int
+    last_sample_count : array-like, shape (n_features,)
 
     Returns
     -------
@@ -678,7 +677,11 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     updated_variance : array, shape (n_features,)
         If None, only mean is computed
 
-    updated_sample_count : int
+    updated_sample_count : array, shape (n_features,)
+
+    Notes
+    -----
+    NaNs are ignored during the algorithm.
 
     References
     ----------
@@ -694,27 +697,28 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    new_sum = X.sum(axis=0)
+    new_sum = np.nansum(X, axis=0)
 
-    new_sample_count = X.shape[0]
+    new_sample_count = np.sum(~np.isnan(X), axis=0)
     updated_sample_count = last_sample_count + new_sample_count
 
     updated_mean = (last_sum + new_sum) / updated_sample_count
 
     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = X.var(axis=0) * new_sample_count
-        if last_sample_count == 0:  # Avoid division by 0
-            updated_unnormalized_variance = new_unnormalized_variance
-        else:
+        new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
+        last_unnormalized_variance = last_variance * last_sample_count
+
+        with np.errstate(divide='ignore'):
             last_over_new_count = last_sample_count / new_sample_count
-            last_unnormalized_variance = last_variance * last_sample_count
             updated_unnormalized_variance = (
-                last_unnormalized_variance +
-                new_unnormalized_variance +
+                last_unnormalized_variance + new_unnormalized_variance +
                 last_over_new_count / updated_sample_count *
                 (last_sum / last_over_new_count - new_sum) ** 2)
+
+        zeros = last_sample_count == 0
+        updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
         updated_variance = updated_unnormalized_variance / updated_sample_count
 
     return updated_mean, updated_variance, updated_sample_count
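
A useful sanity check for the rewritten helper: folding a second NaN-containing batch into the statistics of a first one should agree with np.nanmean/np.nanvar on the stacked data. A sketch using the private helper (internal API, subject to change; data is illustrative):

    import numpy as np
    from sklearn.utils.extmath import _incremental_mean_and_var

    A = np.array([[1., np.nan],
                  [2., 3.]])
    B = np.array([[3., 4.],
                  [np.nan, 5.]])

    # per-feature statistics of the first batch, NaNs excluded
    mean = np.nanmean(A, axis=0)
    var = np.nanvar(A, axis=0)
    count = np.sum(~np.isnan(A), axis=0)

    # fold in the second batch incrementally
    mean, var, count = _incremental_mean_and_var(B, mean, var, count)

    X = np.vstack([A, B])
    assert np.allclose(mean, np.nanmean(X, axis=0))
    assert np.allclose(var, np.nanvar(X, axis=0))
    assert np.array_equal(count, np.sum(~np.isnan(X), axis=0))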
