
Commit dc8578a

Merge pull request scikit-learn#4828 from untom/maxabs_scaler

[MRG] add MaxAbsScaler

2 parents 87aaabc + ab734b7

File tree

5 files changed: +265 −52 lines changed

doc/modules/preprocessing.rst

Lines changed: 60 additions & 19 deletions
@@ -102,8 +102,10 @@ Scaling features to a range
 ---------------------------
 
 An alternative standardization is scaling features to
-lie between a given minimum and maximum value, often between zero and one.
-This can be achieved using :class:`MinMaxScaler`.
+lie between a given minimum and maximum value, often between zero and one,
+or so that the maximum absolute value of each feature is scaled to unit size.
+This can be achieved using :class:`MinMaxScaler` or :class:`MaxAbsScaler`,
+respectively.
 
 The motivation to use this scaling includes robustness to very small
 standard deviations of features and preserving zero entries in sparse data.
@@ -146,6 +148,62 @@ full formula is::
 
     X_scaled = X_std / (max - min) + min
 
+:class:`MaxAbsScaler` works in a very similar fashion, but scales the data so
+that the training set lies within the range ``[-1, 1]``, by dividing each
+feature by its maximum absolute value. It is meant for data that is already
+centered at zero or for sparse data.
+
+Here is how to use the toy data from the previous example with this scaler::
+
+  >>> X_train = np.array([[ 1., -1.,  2.],
+  ...                     [ 2.,  0.,  0.],
+  ...                     [ 0.,  1., -1.]])
+  ...
+  >>> max_abs_scaler = preprocessing.MaxAbsScaler()
+  >>> X_train_maxabs = max_abs_scaler.fit_transform(X_train)
+  >>> X_train_maxabs                # doctest: +NORMALIZE_WHITESPACE
+  array([[ 0.5, -1. ,  1. ],
+         [ 1. ,  0. ,  0. ],
+         [ 0. ,  1. , -0.5]])
+  >>> X_test = np.array([[ -3., -1.,  4.]])
+  >>> X_test_maxabs = max_abs_scaler.transform(X_test)
+  >>> X_test_maxabs                 # doctest: +NORMALIZE_WHITESPACE
+  array([[-1.5, -1. ,  2. ]])
+  >>> max_abs_scaler.scale_         # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
+  array([ 2.,  1.,  2.])
+
+
+As with :func:`scale`, the module further provides a
+convenience function :func:`maxabs_scale` if you don't want to
+create an object.
+
+
+Scaling sparse data
+-------------------
+Centering sparse data would destroy the sparseness structure in the data, and
+thus rarely is a sensible thing to do. However, it can make sense to scale
+sparse inputs, especially if features are on different scales.
+
+:class:`MaxAbsScaler` and :func:`maxabs_scale` were specifically designed
+for scaling sparse data, and are the recommended way to go about this.
+However, :func:`scale` and :class:`StandardScaler` can accept ``scipy.sparse``
+matrices as input, as long as ``with_mean=False`` is explicitly passed
+to the constructor. Otherwise a ``ValueError`` will be raised, as
+silently centering would break the sparsity and would often crash the
+execution by allocating excessive amounts of memory unintentionally.
+:class:`RobustScaler` cannot be fitted to sparse inputs, but you can use
+its ``transform`` method on sparse inputs.
+
+Note that the scalers accept both Compressed Sparse Rows and Compressed
+Sparse Columns formats (see ``scipy.sparse.csr_matrix`` and
+``scipy.sparse.csc_matrix``). Any other sparse input will be **converted to
+the Compressed Sparse Rows representation**. To avoid unnecessary memory
+copies, it is recommended to choose the CSR or CSC representation upstream.
+
+Finally, if the centered data is expected to be small enough, explicitly
+converting the input to an array using the ``toarray`` method of sparse
+matrices is another option.
 
 Scaling data with outliers
 --------------------------
@@ -173,23 +231,6 @@ data.
   or :class:`sklearn.decomposition.RandomizedPCA` with ``whiten=True``
   to further remove the linear correlation across features.
 
-.. topic:: Sparse input
-
-  :func:`scale` and :class:`StandardScaler` accept ``scipy.sparse`` matrices
-  as input **only when with_mean=False is explicitly passed to the
-  constructor**. Otherwise a ``ValueError`` will be raised as
-  silently centering would break the sparsity and would often crash the
-  execution by allocating excessive amounts of memory unintentionally.
-
-  If the centered data is expected to be small enough, explicitly convert
-  the input to an array using the ``toarray`` method of sparse matrices
-  instead.
-
-  For sparse input the data is **converted to the Compressed Sparse Rows
-  representation** (see ``scipy.sparse.csr_matrix``).
-  To avoid unnecessary memory copies, it is recommended to choose the CSR
-  representation upstream.
-
 .. topic:: Scaling target variables in regression
 
   :func:`scale` and :class:`StandardScaler` work out-of-the-box with 1d arrays.
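
The sparsity-preservation claim in the new docs section is easy to verify. A minimal sketch, not part of the diff, assuming a scikit-learn build that includes this merge:

    # MaxAbsScaler scales a sparse matrix column-wise without densifying it;
    # the zero entries (and hence the sparsity pattern) survive.
    import scipy.sparse as sp
    from sklearn.preprocessing import MaxAbsScaler

    X = sp.csr_matrix([[1., -1.,  0.],
                       [2.,  0.,  0.],
                       [0.,  4., -2.]])
    scaler = MaxAbsScaler()
    X_scaled = scaler.fit_transform(X)
    print(scaler.scale_)            # per-feature max absolute values: [ 2.  4.  2.]
    print(X_scaled.nnz == X.nnz)    # True: no zeros created or destroyed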

doc/whats_new.rst

Lines changed: 5 additions & 0 deletions
@@ -21,6 +21,11 @@ New features
      alternative to :class:`preprocessing.StandardScaler` for feature-wise
      centering and range normalization that is robust to outliers. By `Thomas Unterthiner`_.
 
+   - The new class :class:`preprocessing.MaxAbsScaler` provides an
+     alternative to :class:`preprocessing.MinMaxScaler` for feature-wise
+     range normalization when the data is already centered or sparse.
+     By `Thomas Unterthiner`_.
+
 Enhancements
 ............

sklearn/preprocessing/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -6,6 +6,7 @@
 from .data import Binarizer
 from .data import KernelCenterer
 from .data import MinMaxScaler
+from .data import MaxAbsScaler
 from .data import Normalizer
 from .data import RobustScaler
 from .data import StandardScaler
@@ -14,6 +15,7 @@
 from .data import normalize
 from .data import scale
 from .data import robust_scale
+from .data import maxabs_scale
 from .data import OneHotEncoder
 
 from .data import PolynomialFeatures
@@ -33,6 +35,7 @@
     'LabelEncoder',
     'MultiLabelBinarizer',
     'MinMaxScaler',
+    'MaxAbsScaler',
     'Normalizer',
     'OneHotEncoder',
     'RobustScaler',
@@ -43,5 +46,6 @@
     'normalize',
     'scale',
     'robust_scale',
+    'maxabs_scale',
     'label_binarize',
 ]
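
A quick sanity check of the newly exported names; illustrative only, assuming the merged package is installed:

    # Both the estimator and the convenience function are importable from the
    # public sklearn.preprocessing namespace added above.
    import numpy as np
    from sklearn.preprocessing import MaxAbsScaler, maxabs_scale

    X = np.array([[1., -2.], [3., 4.]])
    print(maxabs_scale(X))                   # columns divided by [3., 4.]
    print(MaxAbsScaler().fit_transform(X))   # same result via the estimator API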

sklearn/preprocessing/data.py

Lines changed: 138 additions & 33 deletions
@@ -32,6 +32,7 @@
     'Binarizer',
     'KernelCenterer',
     'MinMaxScaler',
+    'MaxAbsScaler',
     'Normalizer',
     'OneHotEncoder',
     'RobustScaler',
@@ -41,6 +42,7 @@
     'normalize',
     'scale',
     'robust_scale',
+    'maxabs_scale',
 ]
 
 
@@ -59,16 +61,28 @@ def _mean_and_std(X, axis=0, with_mean=True, with_std=True):
 
     if with_std:
         std_ = Xr.std(axis=0)
-        if isinstance(std_, np.ndarray):
-            std_[std_ == 0.] = 1.0
-        elif std_ == 0.:
-            std_ = 1.
+        std_ = _handle_zeros_in_scale(std_)
     else:
         std_ = None
 
     return mean_, std_
 
 
+def _handle_zeros_in_scale(scale):
+    ''' Makes sure that whenever scale is zero, we handle it correctly.
+
+    This happens in most scalers when we have constant features.'''
+
+    # if we are fitting on 1D arrays, scale might be a scalar
+    if np.isscalar(scale):
+        if scale == 0:
+            scale = 1.
+    elif isinstance(scale, np.ndarray):
+        scale[scale == 0.0] = 1.0
+        scale[~np.isfinite(scale)] = 1.0
+    return scale
+
+
 def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
     """Standardize a dataset along any axis
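
To make the helper's contract concrete, here is a standalone re-implementation for illustration; the real one lives in sklearn/preprocessing/data.py and mutates arrays in place:

    import numpy as np

    def handle_zeros_in_scale(scale):
        # A constant feature has scale 0; dividing by it would yield inf/nan,
        # so the scale is forced to 1 and the feature passes through unchanged.
        if np.isscalar(scale):
            return 1. if scale == 0 else scale
        scale = scale.copy()              # unlike the sklearn helper, avoid mutation
        scale[scale == 0.0] = 1.0
        scale[~np.isfinite(scale)] = 1.0
        return scale

    X = np.array([[1., 5.], [2., 5.], [3., 5.]])     # second feature is constant
    print(X / handle_zeros_in_scale(X.std(axis=0)))  # no nan in the second column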
@@ -134,7 +148,7 @@ def scale(X, axis=0, with_mean=True, with_std=True, copy=True):
         if copy:
             X = X.copy()
         _, var = mean_variance_axis(X, axis=0)
-        var[var == 0.0] = 1.0
+        var = _handle_zeros_in_scale(var)
         inplace_column_scale(X, 1 / np.sqrt(var))
     else:
         X = np.asarray(X)
@@ -237,11 +251,7 @@ def fit(self, X, y=None):
                              " than maximum. Got %s." % str(feature_range))
         data_min = np.min(X, axis=0)
         data_range = np.max(X, axis=0) - data_min
-        # Do not scale constant features
-        if isinstance(data_range, np.ndarray):
-            data_range[data_range == 0.0] = 1.0
-        elif data_range == 0.:
-            data_range = 1.
+        data_range = _handle_zeros_in_scale(data_range)
         self.scale_ = (feature_range[1] - feature_range[0]) / data_range
         self.min_ = feature_range[0] - data_min * self.scale_
         self.data_range = data_range
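
Behaviorally this refactor is a no-op; a constant feature still maps cleanly instead of dividing by zero. A sketch, again assuming a build with this merge:

    import numpy as np
    from sklearn.preprocessing import MinMaxScaler

    X = np.array([[1., 7.], [2., 7.], [3., 7.]])  # second feature is constant
    print(MinMaxScaler().fit_transform(X))        # constant column maps to 0, no nan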
@@ -366,7 +376,7 @@ def fit(self, X, y=None):
             if self.with_std:
                 var = mean_variance_axis(X, axis=0)[1]
                 self.std_ = np.sqrt(var)
-                self.std_[var == 0.0] = 1.0
+                self.std_ = _handle_zeros_in_scale(self.std_)
             else:
                 self.std_ = None
             return self
@@ -437,6 +447,119 @@ def inverse_transform(self, X, copy=None):
         return X
 
 
+class MaxAbsScaler(BaseEstimator, TransformerMixin):
+    """Scale each feature by its maximum absolute value.
+
+    This estimator scales each feature individually such
+    that the maximal absolute value of each feature in the
+    training set will be 1.0. It does not shift/center the data, and
+    thus does not destroy any sparsity.
+
+    This scaler can also be applied to sparse CSR or CSC matrices.
+
+    Parameters
+    ----------
+    copy : boolean, optional, default is True
+        Set to False to perform inplace scaling and avoid a copy (if the input
+        is already a numpy array).
+
+    Attributes
+    ----------
+    scale_ : ndarray, shape (n_features,)
+        Per feature relative scaling of the data.
+    """
+
+    def __init__(self, copy=True):
+        self.copy = copy
+
+    def fit(self, X, y=None):
+        """Compute the maximum absolute value to be used for later scaling.
+
+        Parameters
+        ----------
+        X : array-like, shape [n_samples, n_features]
+            The data used to compute the per-feature maximum absolute value
+            used for later scaling along the features axis.
+        """
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
+        if sparse.issparse(X):
+            mins, maxs = min_max_axis(X, axis=0)
+            scales = np.maximum(np.abs(mins), np.abs(maxs))
+        else:
+            scales = np.abs(X).max(axis=0)
+            scales = np.array(scales)
+            scales = scales.reshape(-1)
+        self.scale_ = _handle_zeros_in_scale(scales)
+        return self
+
+    def transform(self, X, y=None):
+        """Scale the data
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix.
+            The data that should be scaled.
+        """
+        check_is_fitted(self, 'scale_')
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
+        if sparse.issparse(X):
+            if X.shape[0] == 1:
+                inplace_row_scale(X, 1.0 / self.scale_)
+            else:
+                inplace_column_scale(X, 1.0 / self.scale_)
+        else:
+            X /= self.scale_
+        return X
+
+    def inverse_transform(self, X):
+        """Scale back the data to the original representation
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix.
+            The data that should be transformed back.
+        """
+        check_is_fitted(self, 'scale_')
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=self.copy,
+                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
+        if sparse.issparse(X):
+            if X.shape[0] == 1:
+                inplace_row_scale(X, self.scale_)
+            else:
+                inplace_column_scale(X, self.scale_)
+        else:
+            X *= self.scale_
+        return X
+
+
+def maxabs_scale(X, axis=0, copy=True):
+    """Scale each feature to the [-1, 1] range without breaking the sparsity.
+
+    This function scales each feature individually such
+    that the maximal absolute value of each feature in the
+    training set will be 1.0.
+
+    It can also be applied to sparse CSR or CSC matrices.
+
+    Parameters
+    ----------
+    X : array-like or sparse matrix.
+        The data that should be scaled.
+
+    axis : int (0 by default)
+        axis used to scale along. If 0, independently scale each feature,
+        otherwise (if 1) scale each sample.
+
+    copy : boolean, optional, default is True
+        Set to False to perform inplace scaling and avoid a copy (if the input
+        is already a numpy array).
+    """
+    s = MaxAbsScaler(copy=copy)
+    if axis == 0:
+        return s.fit_transform(X)
+    else:
+        return s.fit_transform(X.T).T
+
+
 class RobustScaler(BaseEstimator, TransformerMixin):
     """Scale features using statistics that are robust to outliers.
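
A quick sketch of the axis semantics of the new convenience function, for illustration:

    import numpy as np
    from sklearn.preprocessing import maxabs_scale

    X = np.array([[1., -2.,  4.],
                  [2.,  0., -8.]])
    print(maxabs_scale(X, axis=0))  # each column divided by [2., 2., 8.]
    print(maxabs_scale(X, axis=1))  # each row divided by [4., 8.] via the X.T trick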
@@ -507,28 +630,15 @@ def __init__(self, with_centering=True, with_scaling=True, copy=True):
 
     def _check_array(self, X, copy):
         """Makes sure centering is not enabled for sparse matrices."""
-        X = check_array(X, accept_sparse=('csr', 'csc'), dtype=np.float,
-                        copy=copy, ensure_2d=False)
+        X = check_array(X, accept_sparse=('csr', 'csc'), copy=copy,
+                        ensure_2d=False, estimator=self, dtype=FLOAT_DTYPES)
         if sparse.issparse(X):
             if self.with_centering:
                 raise ValueError(
                     "Cannot center sparse matrices: use `with_centering=False`"
                     " instead. See docstring for motivation and alternatives.")
         return X
 
-    def _handle_zeros_in_scale(self, scale):
-        ''' Makes sure that whenever scale is zero, we handle it correctly.
-
-        This happens in most scalers when we have constant features.'''
-        # if we are fitting on 1D arrays, scale might be a scalar
-        if np.isscalar(scale):
-            if scale == 0:
-                scale = 1.
-        elif isinstance(scale, np.ndarray):
-            scale[scale == 0.0] = 1.0
-            scale[~np.isfinite(scale)] = 1.0
-        return scale
-
     def fit(self, X, y=None):
         """Compute the median and quantiles to be used for scaling.
@@ -548,12 +658,7 @@ def fit(self, X, y=None):
         if self.with_scaling:
             q = np.percentile(X, (25, 75), axis=0)
             self.scale_ = (q[1] - q[0])
-            if np.isscalar(self.scale_):
-                if self.scale_ == 0:
-                    self.scale_ = 1.
-            else:
-                self.scale_[self.scale_ == 0.0] = 1.0
-                self.scale_[~np.isfinite(self.scale_)] = 1.0
+            self.scale_ = _handle_zeros_in_scale(self.scale_)
         return self
 
     def transform(self, X, y=None):
@@ -860,7 +965,7 @@ def normalize(X, norm='l2', axis=1, copy=True):
             norms = row_norms(X)
         elif norm == 'max':
             norms = np.max(X, axis=1)
-        norms[norms == 0.0] = 1.0
+        norms = _handle_zeros_in_scale(norms)
         X /= norms[:, np.newaxis]
 
     if axis == 0:
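
The ``'max'`` norm path now routes through the same helper, so an all-zero row keeps a divisor of 1 instead of producing nan. A sketch:

    import numpy as np
    from sklearn.preprocessing import normalize

    X = np.array([[1., 2., 4.],
                  [0., 0., 0.]])
    print(normalize(X, norm='max'))  # [[0.25, 0.5, 1.], [0., 0., 0.]]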
