Commit 5718466

[MRG] ENH: Ignore NaNs in StandardScaler and scale (scikit-learn#11206)

glemaitre authored and ogrisel committed

1 parent fdb3c7c commit 5718466

File tree: 11 files changed, +328 -151 lines changed

doc/whats_new/v0.20.rst

Lines changed: 4 additions & 0 deletions

@@ -230,6 +230,10 @@ Preprocessing
   :issue:`10404` and :issue:`11243` by :user:`Lucija Gregov <LucijaGregov>` and
   :user:`Guillaume Lemaitre <glemaitre>`.
 
+- :class:`preprocessing.StandardScaler` and :func:`preprocessing.scale`
+  ignore and pass-through NaN values.
+  :issue:`11206` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 Model evaluation and meta-estimators
 
 - A scorer based on :func:`metrics.brier_score_loss` is also available.
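
As a quick illustration of the entry above (a minimal sketch against this branch; data and variable names are local to the example): statistics are computed over the non-missing values only, and NaNs come out of the transformation untouched.

    >>> import numpy as np
    >>> from sklearn.preprocessing import scale
    >>> X = np.array([[1., np.nan], [3., 4.], [5., 6.]])
    >>> Xt = scale(X)
    >>> bool(np.isnan(Xt[0, 1]))   # the NaN is passed through
    True
    >>> bool(np.allclose(np.nanmean(Xt, axis=0), 0.))   # NaNs ignored in the stats
    True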

sklearn/decomposition/incremental_pca.py

Lines changed: 4 additions & 3 deletions

@@ -243,9 +243,10 @@ def partial_fit(self, X, y=None, check_input=True):
 
         # Update stats - they are 0 if this is the fisrt step
         col_mean, col_var, n_total_samples = \
-            _incremental_mean_and_var(X, last_mean=self.mean_,
-                                      last_variance=self.var_,
-                                      last_sample_count=self.n_samples_seen_)
+            _incremental_mean_and_var(
+                X, last_mean=self.mean_, last_variance=self.var_,
+                last_sample_count=np.repeat(self.n_samples_seen_, X.shape[1]))
+        n_total_samples = n_total_samples[0]
 
         # Whitening
         if self.n_samples_seen_ == 0:
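
`_incremental_mean_and_var` now expects a per-feature sample count (see sklearn/utils/extmath.py below), so IncrementalPCA, which never sees missing values, broadcasts its scalar count before the call and collapses the returned vector back to a scalar. A standalone sketch of that round trip (plain NumPy; names are local to this example):

    import numpy as np

    n_samples_seen = 10   # scalar count kept by IncrementalPCA
    n_features = 3

    # broadcast to the shape (n_features,) now required by the helper
    last_sample_count = np.repeat(n_samples_seen, n_features)
    # -> array([10, 10, 10])

    # with no NaNs every feature sees the same count, so taking the first
    # entry recovers the scalar, as the new `n_total_samples[0]` line does
    n_total_samples = last_sample_count + 5
    n_total_samples = n_total_samples[0]   # -> 15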

sklearn/preprocessing/data.py

Lines changed: 61 additions & 25 deletions

@@ -126,6 +126,9 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
 
     To avoid memory copy the caller should pass a CSC matrix.
 
+    NaNs are treated as missing values: disregarded to compute the statistics,
+    and maintained during the data transformation.
+
     For a comparison of the different scalers, transformers, and normalizers,
     see :ref:`examples/preprocessing/plot_all_scaling.py
     <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
@@ -138,7 +141,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     """  # noqa
     X = check_array(X, accept_sparse='csc', copy=copy, ensure_2d=False,
                     warn_on_dtype=True, estimator='the scale function',
-                    dtype=FLOAT_DTYPES)
+                    dtype=FLOAT_DTYPES, force_all_finite='allow-nan')
     if sparse.issparse(X):
         if with_mean:
             raise ValueError(
@@ -154,15 +157,15 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     else:
         X = np.asarray(X)
         if with_mean:
-            mean_ = np.mean(X, axis)
+            mean_ = np.nanmean(X, axis)
         if with_std:
-            scale_ = np.std(X, axis)
+            scale_ = np.nanstd(X, axis)
         # Xr is a view on the original array that enables easy use of
         # broadcasting on the axis in which we are interested in
         Xr = np.rollaxis(X, axis)
         if with_mean:
             Xr -= mean_
-            mean_1 = Xr.mean(axis=0)
+            mean_1 = np.nanmean(Xr, axis=0)
             # Verify that mean_1 is 'close to zero'. If X contains very
             # large values, mean_1 can also be very large, due to a lack of
             # precision of mean_. In this case, a pre-scaling of the
@@ -179,7 +182,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
             scale_ = _handle_zeros_in_scale(scale_, copy=False)
             Xr /= scale_
             if with_mean:
-                mean_2 = Xr.mean(axis=0)
+                mean_2 = np.nanmean(Xr, axis=0)
                 # If mean_2 is not 'close to zero', it comes from the fact that
                 # scale_ is very small so that mean_2 = mean_1/scale_ > 0, even
                 # if mean_1 was close to zero. The problem is thus essentially
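
The dense path above simply swaps np.mean/np.std for their NaN-aware counterparts; the centering, scaling, and the mean_1/mean_2 precision checks are unchanged. For contrast (standalone NumPy, illustrative values):

    import numpy as np

    x = np.array([1., np.nan, 3.])
    np.mean(x)     # nan -- a single missing value poisons the statistic
    np.std(x)      # nan
    np.nanmean(x)  # 2.0 -- computed over the two non-missing values
    np.nanstd(x)   # 1.0
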
@@ -520,27 +523,31 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
     Attributes
     ----------
-    scale_ : ndarray, shape (n_features,)
-        Per feature relative scaling of the data.
+    scale_ : ndarray or None, shape (n_features,)
+        Per feature relative scaling of the data. Equal to ``None`` when
+        ``with_std=False``.
 
         .. versionadded:: 0.17
            *scale_*
 
-    mean_ : array of floats with shape [n_features]
+    mean_ : ndarray or None, shape (n_features,)
         The mean value for each feature in the training set.
+        Equal to ``None`` when ``with_mean=False``.
 
-    var_ : array of floats with shape [n_features]
+    var_ : ndarray or None, shape (n_features,)
         The variance for each feature in the training set. Used to compute
-        `scale_`
+        `scale_`. Equal to ``None`` when ``with_std=False``.
 
-    n_samples_seen_ : int
-        The number of samples processed by the estimator. Will be reset on
-        new calls to fit, but increments across ``partial_fit`` calls.
+    n_samples_seen_ : int or array, shape (n_features,)
+        The number of samples processed by the estimator for each feature.
+        If there are not missing samples, the ``n_samples_seen`` will be an
+        integer, otherwise it will be an array.
+        Will be reset on new calls to fit, but increments across
+        ``partial_fit`` calls.
 
     Examples
     --------
     >>> from sklearn.preprocessing import StandardScaler
-    >>>
     >>> data = [[0, 0], [0, 0], [1, 1], [1, 1]]
     >>> scaler = StandardScaler()
     >>> print(scaler.fit(data))
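
The int-or-array contract documented above is observable right after fit. A minimal sketch, assuming this branch's behavior (data is illustrative):

    >>> import numpy as np
    >>> from sklearn.preprocessing import StandardScaler
    >>> X = np.array([[1., 2.], [3., np.nan], [5., 6.]])
    >>> StandardScaler().fit(X[:, :1]).n_samples_seen_  # no NaN: plain integer
    3
    >>> StandardScaler().fit(X).n_samples_seen_  # NaN present: per-feature array
    array([3, 2])
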
@@ -564,6 +571,9 @@ class StandardScaler(BaseEstimator, TransformerMixin):
 
     Notes
     -----
+    NaNs are treated as missing values: disregarded in fit, and maintained in
+    transform.
+
     For a comparison of the different scalers, transformers, and normalizers,
     see :ref:`examples/preprocessing/plot_all_scaling.py
     <sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
@@ -626,22 +636,41 @@ def partial_fit(self, X, y=None):
             Ignored
         """
         X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
-                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES)
+                        warn_on_dtype=True, estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         # Even in the case of `with_mean=False`, we update the mean anyway
         # This is needed for the incremental computation of the var
         # See incr_mean_variance_axis and _incremental_mean_variance_axis
 
+        # if n_samples_seen_ is an integer (i.e. no missing values), we need to
+        # transform it to a NumPy array of shape (n_features,) required by
+        # incr_mean_variance_axis and _incremental_variance_axis
+        if (hasattr(self, 'n_samples_seen_') and
+                isinstance(self.n_samples_seen_, (int, np.integer))):
+            self.n_samples_seen_ = np.repeat(self.n_samples_seen_,
+                                             X.shape[1]).astype(np.int64)
+
         if sparse.issparse(X):
             if self.with_mean:
                 raise ValueError(
                     "Cannot center sparse matrices: pass `with_mean=False` "
                     "instead. See docstring for motivation and alternatives.")
+
+            sparse_constructor = (sparse.csr_matrix
+                                  if X.format == 'csr' else sparse.csc_matrix)
+            counts_nan = sparse_constructor(
+                (np.isnan(X.data), X.indices, X.indptr),
+                shape=X.shape).sum(axis=0).A.ravel()
+
+            if not hasattr(self, 'n_samples_seen_'):
+                self.n_samples_seen_ = (X.shape[0] -
+                                        counts_nan).astype(np.int64)
+
             if self.with_std:
                 # First pass
-                if not hasattr(self, 'n_samples_seen_'):
+                if not hasattr(self, 'scale_'):
                     self.mean_, self.var_ = mean_variance_axis(X, axis=0)
-                    self.n_samples_seen_ = X.shape[0]
                 # Next passes
                 else:
                     self.mean_, self.var_, self.n_samples_seen_ = \
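
The counts_nan expression above reuses the sparse matrix's own structure: rebuilding a matrix from np.isnan(X.data) with the same indices/indptr yields a boolean matrix that is True exactly where a stored entry is NaN (implicit zeros can never be NaN), and a column sum then counts the missing values per feature. The same trick as a standalone sketch (SciPy/NumPy only; data is illustrative):

    import numpy as np
    from scipy import sparse

    X = sparse.csr_matrix(np.array([[0., np.nan, 3.],
                                    [4., 5., np.nan]]))
    # boolean matrix sharing X's sparsity structure, True at stored NaNs
    counts_nan = sparse.csr_matrix(
        (np.isnan(X.data), X.indices, X.indptr),
        shape=X.shape).sum(axis=0).A.ravel()
    print(counts_nan)   # [0 1 1]
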
@@ -652,15 +681,15 @@ def partial_fit(self, X, y=None):
             else:
                 self.mean_ = None
                 self.var_ = None
-                if not hasattr(self, 'n_samples_seen_'):
-                    self.n_samples_seen_ = X.shape[0]
-                else:
-                    self.n_samples_seen_ += X.shape[0]
+                if hasattr(self, 'scale_'):
+                    self.n_samples_seen_ += X.shape[0] - counts_nan
         else:
-            # First pass
             if not hasattr(self, 'n_samples_seen_'):
+                self.n_samples_seen_ = np.zeros(X.shape[1], dtype=np.int64)
+
+            # First pass
+            if not hasattr(self, 'scale_'):
                 self.mean_ = .0
-                self.n_samples_seen_ = 0
                 if self.with_std:
                     self.var_ = .0
                 else:
@@ -669,12 +698,18 @@ def partial_fit(self, X, y=None):
             if not self.with_mean and not self.with_std:
                 self.mean_ = None
                 self.var_ = None
-                self.n_samples_seen_ += X.shape[0]
+                self.n_samples_seen_ += X.shape[0] - np.isnan(X).sum(axis=0)
             else:
                 self.mean_, self.var_, self.n_samples_seen_ = \
                     _incremental_mean_and_var(X, self.mean_, self.var_,
                                               self.n_samples_seen_)
 
+        # for backward-compatibility, reduce n_samples_seen_ to an integer
+        # if the number of samples is the same for each feature (i.e. no
+        # missing values)
+        if np.ptp(self.n_samples_seen_) == 0:
+            self.n_samples_seen_ = self.n_samples_seen_[0]
+
         if self.with_std:
             self.scale_ = _handle_zeros_in_scale(np.sqrt(self.var_))
         else:
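
In the backward-compatibility reduction above, np.ptp ("peak to peak", max minus min) is zero exactly when every feature saw the same number of samples, i.e. the no-missing-values case, so the array can safely collapse back to the historical integer form. Minimal sketch (plain NumPy):

    import numpy as np

    n_samples_seen = np.array([10, 10, 10])
    if np.ptp(n_samples_seen) == 0:          # max - min == 0: all counts equal
        n_samples_seen = n_samples_seen[0]   # back to a plain integer
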
@@ -704,7 +739,8 @@ def transform(self, X, y='deprecated', copy=None):
 
         copy = copy if copy is not None else self.copy
         X = check_array(X, accept_sparse='csr', copy=copy, warn_on_dtype=True,
-                        estimator=self, dtype=FLOAT_DTYPES)
+                        estimator=self, dtype=FLOAT_DTYPES,
+                        force_all_finite='allow-nan')
 
         if sparse.issparse(X):
             if self.with_mean:

sklearn/preprocessing/tests/test_common.py

Lines changed: 5 additions & 1 deletion

@@ -9,9 +9,11 @@
 from sklearn.base import clone
 
 from sklearn.preprocessing import minmax_scale
+from sklearn.preprocessing import scale
 from sklearn.preprocessing import quantile_transform
 
 from sklearn.preprocessing import MinMaxScaler
+from sklearn.preprocessing import StandardScaler
 from sklearn.preprocessing import QuantileTransformer
 
 from sklearn.utils.testing import assert_array_equal
@@ -28,6 +30,8 @@ def _get_valid_samples_by_column(X, col):
 @pytest.mark.parametrize(
     "est, func, support_sparse",
     [(MinMaxScaler(), minmax_scale, False),
+     (StandardScaler(), scale, False),
+     (StandardScaler(with_mean=False), scale, True),
      (QuantileTransformer(n_quantiles=10), quantile_transform, True)]
 )
 def test_missing_value_handling(est, func, support_sparse):
@@ -66,7 +70,7 @@ def test_missing_value_handling(est, func, support_sparse):
         est.fit(_get_valid_samples_by_column(X_train, i))
         # check transforming with NaN works even when training without NaN
         Xt_col = est.transform(X_test[:, [i]])
-        assert_array_equal(Xt_col, Xt[:, [i]])
+        assert_allclose(Xt_col, Xt[:, [i]])
         # check non-NaN is handled as before - the 1st column is all nan
         if not np.isnan(X_test[:, i]).all():
             Xt_col_nonan = est.transform(

sklearn/preprocessing/tests/test_data.py

Lines changed: 26 additions & 8 deletions

@@ -33,6 +33,7 @@
 from sklearn.utils.testing import assert_warns_message
 from sklearn.utils.testing import assert_no_warnings
 from sklearn.utils.testing import assert_allclose
+from sklearn.utils.testing import assert_allclose_dense_sparse
 from sklearn.utils.testing import skip_if_32bit
 
 from sklearn.utils.sparsefuncs import mean_variance_axis
@@ -699,6 +700,28 @@ def test_scaler_without_centering():
     assert_array_almost_equal(X_csc_scaled_back.toarray(), X)
 
 
+@pytest.mark.parametrize("with_mean", [True, False])
+@pytest.mark.parametrize("with_std", [True, False])
+@pytest.mark.parametrize("array_constructor",
+                         [np.asarray, sparse.csc_matrix, sparse.csr_matrix])
+def test_scaler_n_samples_seen_with_nan(with_mean, with_std,
+                                        array_constructor):
+    X = np.array([[0, 1, 3],
+                  [np.nan, 6, 10],
+                  [5, 4, np.nan],
+                  [8, 0, np.nan]],
+                 dtype=np.float64)
+    X = array_constructor(X)
+
+    if sparse.issparse(X) and with_mean:
+        pytest.skip("'with_mean=True' cannot be used with sparse matrix.")
+
+    transformer = StandardScaler(with_mean=with_mean, with_std=with_std)
+    transformer.fit(X)
+
+    assert_array_equal(transformer.n_samples_seen_, np.array([3, 4, 2]))
+
+
 def _check_identity_scalers_attributes(scaler_1, scaler_2):
     assert scaler_1.mean_ is scaler_2.mean_ is None
     assert scaler_1.var_ is scaler_2.var_ is None
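
The array asserted in test_scaler_n_samples_seen_with_nan above follows directly from counting non-missing entries per column: column 0 has one NaN (3 valid samples), column 1 has none (4), and column 2 has two (2). Standalone check (plain NumPy):

    import numpy as np

    X = np.array([[0, 1, 3],
                  [np.nan, 6, 10],
                  [5, 4, np.nan],
                  [8, 0, np.nan]], dtype=np.float64)
    print(np.sum(~np.isnan(X), axis=0))   # [3 4 2]
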
@@ -725,8 +748,8 @@ def test_scaler_return_identity():
     transformer_csc = clone(transformer_dense)
     X_trans_csc = transformer_csc.fit_transform(X_csc)
 
-    assert_allclose(X_trans_csr.toarray(), X_csr.toarray())
-    assert_allclose(X_trans_csc.toarray(), X_csc.toarray())
+    assert_allclose_dense_sparse(X_trans_csr, X_csr)
+    assert_allclose_dense_sparse(X_trans_csc, X_csc)
     assert_allclose(X_trans_dense, X_dense)
 
     for trans_1, trans_2 in itertools.combinations([transformer_dense,
@@ -877,14 +900,9 @@ def test_scale_sparse_with_mean_raise_exception():
 
 def test_scale_input_finiteness_validation():
     # Check if non finite inputs raise ValueError
-    X = [[np.nan, 5, 6, 7, 8]]
-    assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
-                        scale, X)
-
     X = [[np.inf, 5, 6, 7, 8]]
     assert_raises_regex(ValueError,
-                        "Input contains NaN, infinity or a value too large",
+                        "Input contains infinity or a value too large",
                         scale, X)
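
With force_all_finite='allow-nan', input validation now rejects only infinity, which is why the NaN case is deleted from the test above. A sketch of the expected behavior on this branch (illustrative data):

    import numpy as np
    from sklearn.preprocessing import scale

    X = [[np.nan, 5., 6., 7., 8.],
         [2., 5., 6., 7., 8.]]
    scale(X)  # no longer raises: the NaN is ignored and passed through

    try:
        scale([[np.inf, 5., 6., 7., 8.]])
    except ValueError as exc:
        print(exc)  # mentions "Input contains infinity or a value too large"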

sklearn/utils/estimator_checks.py

Lines changed: 1 addition & 1 deletion

@@ -78,7 +78,7 @@
                 'RandomForestRegressor', 'Ridge', 'RidgeCV']
 
 ALLOW_NAN = ['Imputer', 'SimpleImputer', 'MICEImputer',
-             'MinMaxScaler', 'QuantileTransformer']
+             'MinMaxScaler', 'StandardScaler', 'QuantileTransformer']
 
 
 def _yield_non_meta_checks(name, estimator):

sklearn/utils/extmath.py

Lines changed: 17 additions & 13 deletions

@@ -647,8 +647,7 @@ def make_nonnegative(X, min_value=0):
     return X
 
 
-def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
-                              last_sample_count=0):
+def _incremental_mean_and_var(X, last_mean, last_variance, last_sample_count):
     """Calculate mean update and a Youngs and Cramer variance update.
 
     last_mean and last_variance are statistics computed at the last step by the
@@ -669,7 +668,7 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
 
     last_variance : array-like, shape: (n_features,)
 
-    last_sample_count : int
+    last_sample_count : array-like, shape (n_features,)
 
     Returns
     -------
@@ -678,7 +677,11 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     updated_variance : array, shape (n_features,)
         If None, only mean is computed
 
-    updated_sample_count : int
+    updated_sample_count : array, shape (n_features,)
+
+    Notes
+    -----
+    NaNs are ignored during the algorithm.
 
     References
     ----------
@@ -694,27 +697,28 @@ def _incremental_mean_and_var(X, last_mean=.0, last_variance=None,
     # new = the current increment
     # updated = the aggregated stats
     last_sum = last_mean * last_sample_count
-    new_sum = X.sum(axis=0)
+    new_sum = np.nansum(X, axis=0)
 
-    new_sample_count = X.shape[0]
+    new_sample_count = np.sum(~np.isnan(X), axis=0)
     updated_sample_count = last_sample_count + new_sample_count
 
     updated_mean = (last_sum + new_sum) / updated_sample_count
 
     if last_variance is None:
         updated_variance = None
     else:
-        new_unnormalized_variance = X.var(axis=0) * new_sample_count
-        if last_sample_count == 0:  # Avoid division by 0
-            updated_unnormalized_variance = new_unnormalized_variance
-        else:
+        new_unnormalized_variance = np.nanvar(X, axis=0) * new_sample_count
+        last_unnormalized_variance = last_variance * last_sample_count
+
+        with np.errstate(divide='ignore'):
             last_over_new_count = last_sample_count / new_sample_count
-            last_unnormalized_variance = last_variance * last_sample_count
             updated_unnormalized_variance = (
-                last_unnormalized_variance +
-                new_unnormalized_variance +
+                last_unnormalized_variance + new_unnormalized_variance +
                 last_over_new_count / updated_sample_count *
                 (last_sum / last_over_new_count - new_sum) ** 2)
+
+        zeros = last_sample_count == 0
+        updated_unnormalized_variance[zeros] = new_unnormalized_variance[zeros]
         updated_variance = updated_unnormalized_variance / updated_sample_count
 
     return updated_mean, updated_variance, updated_sample_count
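
A useful sanity check for the rewritten helper: folding a second NaN-containing batch into the statistics of a first one should agree with np.nanmean/np.nanvar on the stacked data. A sketch using the private helper (internal API, subject to change; data is illustrative):

    import numpy as np
    from sklearn.utils.extmath import _incremental_mean_and_var

    A = np.array([[1., np.nan],
                  [2., 3.]])
    B = np.array([[3., 4.],
                  [np.nan, 5.]])

    # per-feature statistics of the first batch, NaNs excluded
    mean = np.nanmean(A, axis=0)
    var = np.nanvar(A, axis=0)
    count = np.sum(~np.isnan(A), axis=0)

    # fold in the second batch incrementally
    mean, var, count = _incremental_mean_and_var(B, mean, var, count)

    X = np.vstack([A, B])
    assert np.allclose(mean, np.nanmean(X, axis=0))
    assert np.allclose(var, np.nanvar(X, axis=0))
    assert np.array_equal(count, np.sum(~np.isnan(X), axis=0))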
