Skip to content

Commit d25f12b

Browse files
committed
FIX copy in OneHotEncoder and _transform_selected
Avoids issues raised by @GaelVaroquaux: when a transformed matrix unexpectedly shares memory with its original, modifications to the original show up in the transformed version. Since this is an unlikely corner case in OneHotEncoder, I didn't add a copy argument to it.
1 parent 4fe51ba commit d25f12b

File tree

2 files changed

+44
-38
lines changed

2 files changed

+44
-38
lines changed

sklearn/preprocessing.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ def transform(self, X, y=None, copy=None):
637637
return binarize(X, threshold=self.threshold, copy=copy)
638638

639639

640-
def _transform_selected(X, transform, selected="all"):
640+
def _transform_selected(X, transform, selected="all", copy=True):
641641
"""Apply a transform function to portion of selected features
642642
643643
Parameters
@@ -648,40 +648,46 @@ def _transform_selected(X, transform, selected="all"):
648648
transform : callable
649649
A callable transform(X) -> X_transformed
650650
651+
copy : boolean, optional
652+
Copy X even if it could be avoided.
653+
651654
selected: "all" or array of indices or mask
652-
Specify what features to apply the transform to.
655+
Specify which features to apply the transform to. May not be a mask
656+
for sparse X.
653657
654658
Returns
655659
-------
656660
X : array or sparse matrix, shape=(n_samples, n_features_new)
657661
"""
658662
if selected == "all":
659663
return transform(X)
660-
elif len(selected) == 0:
664+
665+
X = atleast2d_or_csc(X, copy=copy)
666+
667+
if len(selected) == 0:
668+
return X
669+
670+
n_features = X.shape[1]
671+
ind = np.arange(n_features)
672+
sel = np.zeros(n_features, dtype=bool)
673+
sel[np.asarray(selected)] = True
674+
not_sel = np.logical_not(sel)
675+
n_selected = np.sum(sel)
676+
677+
if n_selected == 0:
678+
# No features selected.
661679
return X
680+
elif n_selected == n_features:
681+
# All features selected.
682+
return transform(X)
662683
else:
663-
X = atleast2d_or_csc(X)
664-
n_features = X.shape[1]
665-
ind = np.arange(n_features)
666-
sel = np.zeros(n_features, dtype=bool)
667-
sel[np.array(selected)] = True
668-
not_sel = np.logical_not(sel)
669-
n_selected = np.sum(sel)
670-
671-
if n_selected == 0:
672-
# No features selected.
673-
return X
674-
elif n_selected == n_features:
675-
# All features selected.
676-
return transform(X)
677-
else:
678-
X_sel = transform(X[:, ind[sel]])
679-
X_not_sel = X[:, ind[not_sel]]
684+
X_sel = transform(X[:, ind[sel]])
685+
X_not_sel = X[:, ind[not_sel]]
680686

681-
if sp.issparse(X_sel) or sp.issparse(X_not_sel):
682-
return sp.hstack((X_sel, X_not_sel))
683-
else:
684-
return np.hstack((X_sel, X_not_sel))
687+
if sp.issparse(X_sel) or sp.issparse(X_not_sel):
688+
return sp.hstack((X_sel, X_not_sel))
689+
else:
690+
return np.hstack((X_sel, X_not_sel))
685691

686692

687693
class OneHotEncoder(BaseEstimator, TransformerMixin):
@@ -693,8 +699,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
693699
feature. It is assumed that input features take on values in the range
694700
[0, n_values).
695701
696-
This encoding is needed for feeding categorical data to scikit-learn
697-
estimators.
702+
This encoding is needed for feeding categorical data to many scikit-learn
703+
estimators, notably linear models and SVMs with the standard kernels.
698704
699705
Parameters
700706
----------
@@ -778,7 +784,7 @@ def fit(self, X, y=None):
778784
return self
779785

780786
def _fit_transform(self, X):
781-
"""Asssumes X contains only categorical features."""
787+
"""Assumes X contains only categorical features."""
782788
X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
783789
if np.any(X < 0):
784790
raise ValueError("X needs to contain only non-negative integers.")
@@ -826,7 +832,7 @@ def fit_transform(self, X, y=None):
826832
efficient. See fit for the parameters, transform for the return value.
827833
"""
828834
return _transform_selected(X, self._fit_transform,
829-
self.categorical_features)
835+
self.categorical_features, copy=True)
830836

831837
def _transform(self, X):
832838
"""Asssumes X contains only categorical features."""
@@ -870,7 +876,7 @@ def transform(self, X):
870876
Transformed input.
871877
"""
872878
return _transform_selected(X, self._transform,
873-
self.categorical_features)
879+
self.categorical_features, copy=True)
874880

875881

876882
class LabelEncoder(BaseEstimator, TransformerMixin):

sklearn/tests/test_preprocessing.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -613,23 +613,23 @@ def test_one_hot_encoder():
613613
assert_raises(ValueError, enc.transform, [[0], [-1]])
614614

615615

616-
def _check_transform_selected(X, Xexpected, sel):
616+
def _check_transform_selected(X, X_expected, sel):
617617
for M in (X, sp.csr_matrix(X)):
618618
Xtr = _transform_selected(M, Binarizer().transform, sel)
619-
assert_array_equal(toarray(Xtr), Xexpected)
619+
assert_array_equal(toarray(Xtr), X_expected)
620620

621621

622622
def test_transform_selected():
623623
X = [[3, 2, 1], [0, 1, 1]]
624624

625-
Xexpected = [[1, 2, 1], [0, 1, 1]]
626-
_check_transform_selected(X, Xexpected, [0])
627-
_check_transform_selected(X, Xexpected, [True, False, False])
625+
X_expected = [[1, 2, 1], [0, 1, 1]]
626+
_check_transform_selected(X, X_expected, [0])
627+
_check_transform_selected(X, X_expected, [True, False, False])
628628

629-
Xexpected = [[1, 1, 1], [0, 1, 1]]
630-
_check_transform_selected(X, Xexpected, [0, 1, 2])
631-
_check_transform_selected(X, Xexpected, [True, True, True])
632-
_check_transform_selected(X, Xexpected, "all")
629+
X_expected = [[1, 1, 1], [0, 1, 1]]
630+
_check_transform_selected(X, X_expected, [0, 1, 2])
631+
_check_transform_selected(X, X_expected, [True, True, True])
632+
_check_transform_selected(X, X_expected, "all")
633633

634634
_check_transform_selected(X, X, [])
635635
_check_transform_selected(X, X, [False, False, False])

0 commit comments

Comments
 (0)