Skip to content

Commit d25f12b

Browse files
committed
FIX copy in OneHotEncoder and _transform_selected
Avoids issues raised by @GaelVaroquaux: when a transformed matrix unexpectedly shares memory with its original, modifications to the original show up in the transformed version. Since this is an unlikely corner case in OneHotEncoder, I didn't add a copy argument to it.
1 parent 4fe51ba commit d25f12b

File tree

2 files changed

+44
-38
lines changed

2 files changed

+44
-38
lines changed

sklearn/preprocessing.py

Lines changed: 35 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@ def transform(self, X, y=None, copy=None):
637637
return binarize(X, threshold=self.threshold, copy=copy)
638638

639639

640-
def _transform_selected(X, transform, selected="all"):
640+
def _transform_selected(X, transform, selected="all", copy=True):
641641
"""Apply a transform function to portion of selected features
642642
643643
Parameters
@@ -648,40 +648,46 @@ def _transform_selected(X, transform, selected="all"):
648648
transform : callable
649649
A callable transform(X) -> X_transformed
650650
651+
copy : boolean, optional
652+
Copy X even if it could be avoided.
653+
651654
selected: "all" or array of indices or mask
652-
Specify what features to apply the transform to.
655+
Specify which features to apply the transform to. May not be a mask
656+
for sparse X.
653657
654658
Returns
655659
-------
656660
X : array or sparse matrix, shape=(n_samples, n_features_new)
657661
"""
658662
if selected == "all":
659663
return transform(X)
660-
elif len(selected) == 0:
664+
665+
X = atleast2d_or_csc(X, copy=copy)
666+
667+
if len(selected) == 0:
668+
return X
669+
670+
n_features = X.shape[1]
671+
ind = np.arange(n_features)
672+
sel = np.zeros(n_features, dtype=bool)
673+
sel[np.asarray(selected)] = True
674+
not_sel = np.logical_not(sel)
675+
n_selected = np.sum(sel)
676+
677+
if n_selected == 0:
678+
# No features selected.
661679
return X
680+
elif n_selected == n_features:
681+
# All features selected.
682+
return transform(X)
662683
else:
663-
X = atleast2d_or_csc(X)
664-
n_features = X.shape[1]
665-
ind = np.arange(n_features)
666-
sel = np.zeros(n_features, dtype=bool)
667-
sel[np.array(selected)] = True
668-
not_sel = np.logical_not(sel)
669-
n_selected = np.sum(sel)
670-
671-
if n_selected == 0:
672-
# No features selected.
673-
return X
674-
elif n_selected == n_features:
675-
# All features selected.
676-
return transform(X)
677-
else:
678-
X_sel = transform(X[:, ind[sel]])
679-
X_not_sel = X[:, ind[not_sel]]
684+
X_sel = transform(X[:, ind[sel]])
685+
X_not_sel = X[:, ind[not_sel]]
680686

681-
if sp.issparse(X_sel) or sp.issparse(X_not_sel):
682-
return sp.hstack((X_sel, X_not_sel))
683-
else:
684-
return np.hstack((X_sel, X_not_sel))
687+
if sp.issparse(X_sel) or sp.issparse(X_not_sel):
688+
return sp.hstack((X_sel, X_not_sel))
689+
else:
690+
return np.hstack((X_sel, X_not_sel))
685691

686692

687693
class OneHotEncoder(BaseEstimator, TransformerMixin):
@@ -693,8 +699,8 @@ class OneHotEncoder(BaseEstimator, TransformerMixin):
693699
feature. It is assumed that input features take on values in the range
694700
[0, n_values).
695701
696-
This encoding is needed for feeding categorical data to scikit-learn
697-
estimators.
702+
This encoding is needed for feeding categorical data to many scikit-learn
703+
estimators, notably linear models and SVMs with the standard kernels.
698704
699705
Parameters
700706
----------
@@ -778,7 +784,7 @@ def fit(self, X, y=None):
778784
return self
779785

780786
def _fit_transform(self, X):
781-
"""Asssumes X contains only categorical features."""
787+
"""Assumes X contains only categorical features."""
782788
X = check_arrays(X, sparse_format='dense', dtype=np.int)[0]
783789
if np.any(X < 0):
784790
raise ValueError("X needs to contain only non-negative integers.")
@@ -826,7 +832,7 @@ def fit_transform(self, X, y=None):
826832
efficient. See fit for the parameters, transform for the return value.
827833
"""
828834
return _transform_selected(X, self._fit_transform,
829-
self.categorical_features)
835+
self.categorical_features, copy=True)
830836

831837
def _transform(self, X):
832838
"""Asssumes X contains only categorical features."""
@@ -870,7 +876,7 @@ def transform(self, X):
870876
Transformed input.
871877
"""
872878
return _transform_selected(X, self._transform,
873-
self.categorical_features)
879+
self.categorical_features, copy=True)
874880

875881

876882
class LabelEncoder(BaseEstimator, TransformerMixin):

sklearn/tests/test_preprocessing.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -613,23 +613,23 @@ def test_one_hot_encoder():
613613
assert_raises(ValueError, enc.transform, [[0], [-1]])
614614

615615

616-
def _check_transform_selected(X, Xexpected, sel):
616+
def _check_transform_selected(X, X_expected, sel):
617617
for M in (X, sp.csr_matrix(X)):
618618
Xtr = _transform_selected(M, Binarizer().transform, sel)
619-
assert_array_equal(toarray(Xtr), Xexpected)
619+
assert_array_equal(toarray(Xtr), X_expected)
620620

621621

622622
def test_transform_selected():
623623
X = [[3, 2, 1], [0, 1, 1]]
624624

625-
Xexpected = [[1, 2, 1], [0, 1, 1]]
626-
_check_transform_selected(X, Xexpected, [0])
627-
_check_transform_selected(X, Xexpected, [True, False, False])
625+
X_expected = [[1, 2, 1], [0, 1, 1]]
626+
_check_transform_selected(X, X_expected, [0])
627+
_check_transform_selected(X, X_expected, [True, False, False])
628628

629-
Xexpected = [[1, 1, 1], [0, 1, 1]]
630-
_check_transform_selected(X, Xexpected, [0, 1, 2])
631-
_check_transform_selected(X, Xexpected, [True, True, True])
632-
_check_transform_selected(X, Xexpected, "all")
629+
X_expected = [[1, 1, 1], [0, 1, 1]]
630+
_check_transform_selected(X, X_expected, [0, 1, 2])
631+
_check_transform_selected(X, X_expected, [True, True, True])
632+
_check_transform_selected(X, X_expected, "all")
633633

634634
_check_transform_selected(X, X, [])
635635
_check_transform_selected(X, X, [False, False, False])

0 commit comments

Comments
 (0)