Skip to content

Commit 4aada4e

Browse files
authored
ENH Support unknown_value=np.nan in OrdinalEncoder (scikit-learn#18406)
1 parent f3b64db commit 4aada4e

File tree

3 files changed

+51
-16
lines changed

3 files changed

+51
-16
lines changed

doc/whats_new/v0.24.rst

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,8 @@ Changelog
542542
``use_encoded_value`` option, along with a new ``unknown_value`` parameter,
543543
to :class:`preprocessing.OrdinalEncoder` to allow unknown categories during
544544
transform and set the encoded value of the unknown categories.
545-
:pr:`17406` by :user:`Felix Wick <FelixWick>`.
545+
:pr:`17406` by :user:`Felix Wick <FelixWick>` and :pr:`18406` by
546+
`Nicolas Hug`_.
546547

547548
- |Feature| Add ``clip`` parameter to :class:`preprocessing.MinMaxScaler`,
548549
which clips the transformed values of test data to ``feature_range``.

sklearn/preprocessing/_encoders.py

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numbers
88

99
from ..base import BaseEstimator, TransformerMixin
10-
from ..utils import check_array
10+
from ..utils import check_array, is_scalar_nan
1111
from ..utils.validation import check_is_fitted
1212
from ..utils.validation import _deprecate_positional_args
1313

@@ -631,11 +631,12 @@ class OrdinalEncoder(_BaseEncoder):
631631
632632
.. versionadded:: 0.24
633633
634-
unknown_value : int, default=None
634+
unknown_value : int or np.nan, default=None
635635
When the parameter handle_unknown is set to 'use_encoded_value', this
636636
parameter is required and will set the encoded value of unknown
637637
categories. It has to be distinct from the values used to encode any of
638-
the categories in `fit`.
638+
the categories in `fit`. If set to np.nan, the `dtype` parameter must
639+
be a float dtype.
639640
640641
.. versionadded:: 0.24
641642
@@ -699,13 +700,21 @@ def fit(self, X, y=None):
699700
self
700701
"""
701702
if self.handle_unknown == 'use_encoded_value':
702-
if not isinstance(self.unknown_value, numbers.Integral):
703-
raise TypeError(f"unknown_value should be an integer when "
704-
f"`handle_unknown is 'use_encoded_value'`, "
703+
if is_scalar_nan(self.unknown_value):
704+
if np.dtype(self.dtype).kind != 'f':
705+
raise ValueError(
706+
f"When unknown_value is np.nan, the dtype "
707+
"parameter should be "
708+
f"a float dtype. Got {self.dtype}."
709+
)
710+
elif not isinstance(self.unknown_value, numbers.Integral):
711+
raise TypeError(f"unknown_value should be an integer or "
712+
f"np.nan when "
713+
f"handle_unknown is 'use_encoded_value', "
705714
f"got {self.unknown_value}.")
706715
elif self.unknown_value is not None:
707716
raise TypeError(f"unknown_value should only be set when "
708-
f"`handle_unknown is 'use_encoded_value'`, "
717+
f"handle_unknown is 'use_encoded_value', "
709718
f"got {self.unknown_value}.")
710719

711720
self._fit(X)
@@ -735,11 +744,12 @@ def transform(self, X):
735744
Transformed input.
736745
"""
737746
X_int, X_mask = self._transform(X, handle_unknown=self.handle_unknown)
747+
X_trans = X_int.astype(self.dtype, copy=False)
738748

739749
# create separate category for unknown values
740750
if self.handle_unknown == 'use_encoded_value':
741-
X_int[~X_mask] = self.unknown_value
742-
return X_int.astype(self.dtype, copy=False)
751+
X_trans[~X_mask] = self.unknown_value
752+
return X_trans
743753

744754
def inverse_transform(self, X):
745755
"""

sklearn/preprocessing/tests/test_encoders.py

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -589,21 +589,21 @@ def test_ordinal_encoder_handle_unknowns_raise():
589589
X = np.array([['a', 'x'], ['b', 'y']], dtype=object)
590590

591591
enc = OrdinalEncoder(handle_unknown='use_encoded_value')
592-
msg = ("unknown_value should be an integer when `handle_unknown is "
593-
"'use_encoded_value'`, got None.")
592+
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
593+
"is 'use_encoded_value', got None.")
594594
with pytest.raises(TypeError, match=msg):
595595
enc.fit(X)
596596

597597
enc = OrdinalEncoder(unknown_value=-2)
598-
msg = ("unknown_value should only be set when `handle_unknown is "
599-
"'use_encoded_value'`, got -2.")
598+
msg = ("unknown_value should only be set when handle_unknown is "
599+
"'use_encoded_value', got -2.")
600600
with pytest.raises(TypeError, match=msg):
601601
enc.fit(X)
602602

603603
enc = OrdinalEncoder(handle_unknown='use_encoded_value',
604604
unknown_value='bla')
605-
msg = ("unknown_value should be an integer when `handle_unknown is "
606-
"'use_encoded_value'`, got bla.")
605+
msg = ("unknown_value should be an integer or np.nan when handle_unknown "
606+
"is 'use_encoded_value', got bla.")
607607
with pytest.raises(TypeError, match=msg):
608608
enc.fit(X)
609609

@@ -614,6 +614,30 @@ def test_ordinal_encoder_handle_unknowns_raise():
614614
enc.fit(X)
615615

616616

617+
def test_ordinal_encoder_handle_unknowns_nan():
618+
# Make sure unknown_value=np.nan works properly
619+
620+
enc = OrdinalEncoder(handle_unknown='use_encoded_value',
621+
unknown_value=np.nan)
622+
623+
X_fit = np.array([[1], [2], [3]])
624+
enc.fit(X_fit)
625+
X_trans = enc.transform([[1], [2], [4]])
626+
assert_array_equal(X_trans, [[0], [1], [np.nan]])
627+
628+
629+
def test_ordinal_encoder_handle_unknowns_nan_non_float_dtype():
630+
# Make sure an error is raised when unknown_value=np.nan and the dtype
631+
# isn't a float dtype
632+
enc = OrdinalEncoder(handle_unknown='use_encoded_value',
633+
unknown_value=np.nan, dtype=int)
634+
635+
X_fit = np.array([[1], [2], [3]])
636+
with pytest.raises(ValueError,
637+
match="dtype parameter should be a float dtype"):
638+
enc.fit(X_fit)
639+
640+
617641
def test_ordinal_encoder_raise_categories_shape():
618642

619643
X = np.array([['Low', 'Medium', 'High', 'Medium', 'Low']], dtype=object).T

0 commit comments

Comments
 (0)