
Commit 0d8a04b

jeremiedbb authored and ogrisel committed
[MRG+1] SimpleImputer(strategy="constant") (scikit-learn#11211)
1 parent cd8c01d commit 0d8a04b


12 files changed: +525 -104 lines


doc/conftest.py

Lines changed: 9 additions & 0 deletions
@@ -68,6 +68,13 @@ def setup_compose():
         raise SkipTest("Skipping compose.rst, pandas not installed")
 
 
+def setup_impute():
+    try:
+        import pandas  # noqa
+    except ImportError:
+        raise SkipTest("Skipping impute.rst, pandas not installed")
+
+
 def pytest_runtest_setup(item):
     fname = item.fspath.strpath
     is_index = fname.endswith('datasets/index.rst')
@@ -84,6 +91,8 @@ def pytest_runtest_setup(item):
         setup_working_with_text_data()
     elif fname.endswith('modules/compose.rst') or is_index:
         setup_compose()
+    elif fname.endswith('modules/impute.rst'):
+        setup_impute()
 
 
 def pytest_runtest_teardown(item):

doc/modules/impute.rst

Lines changed: 25 additions & 7 deletions
@@ -20,19 +20,20 @@ Univariate feature imputation
 =============================
 
 The :class:`SimpleImputer` class provides basic strategies for imputing missing
-values, either using the mean, the median or the most frequent value of
-the row or column in which the missing values are located. This class
-also allows for different missing values encodings.
+values. Missing values can be imputed with a provided constant value, or using
+the statistics (mean, median or most frequent) of each column in which the
+missing values are located. This class also allows for different missing values
+encodings.
 
 The following snippet demonstrates how to replace missing values,
 encoded as ``np.nan``, using the mean value of the columns (axis 0)
 that contain the missing values::
 
     >>> import numpy as np
     >>> from sklearn.impute import SimpleImputer
-    >>> imp = SimpleImputer(missing_values='NaN', strategy='mean')
+    >>> imp = SimpleImputer(missing_values=np.nan, strategy='mean')
     >>> imp.fit([[1, 2], [np.nan, 3], [7, 6]])  # doctest: +NORMALIZE_WHITESPACE
-    SimpleImputer(copy=True, missing_values='NaN', strategy='mean', verbose=0)
+    SimpleImputer(copy=True, fill_value=None, missing_values=nan, strategy='mean', verbose=0)
    >>> X = [[np.nan, 2], [6, np.nan], [7, 6]]
     >>> print(imp.transform(X))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
     [[4. 2. ]
@@ -45,7 +46,7 @@ The :class:`SimpleImputer` class also supports sparse matrices::
     >>> X = sp.csc_matrix([[1, 2], [0, 3], [7, 6]])
     >>> imp = SimpleImputer(missing_values=0, strategy='mean')
     >>> imp.fit(X)  # doctest: +NORMALIZE_WHITESPACE
-    SimpleImputer(copy=True, missing_values=0, strategy='mean', verbose=0)
+    SimpleImputer(copy=True, fill_value=None, missing_values=0, strategy='mean', verbose=0)
     >>> X_test = sp.csc_matrix([[0, 2], [6, 0], [7, 6]])
     >>> print(imp.transform(X_test))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
     [[4. 2. ]
@@ -56,6 +57,23 @@ Note that, here, missing values are encoded by 0 and are thus implicitly stored
 in the matrix. This format is thus suitable when there are many more missing
 values than observed values.
 
+The :class:`SimpleImputer` class also supports categorical data represented as
+string values or pandas categoricals when using the ``'most_frequent'`` or
+``'constant'`` strategy::
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame([["a", "x"],
+    ...                    [np.nan, "y"],
+    ...                    ["a", np.nan],
+    ...                    ["b", "y"]], dtype="category")
+    ...
+    >>> imp = SimpleImputer(strategy="most_frequent")
+    >>> print(imp.fit_transform(df))  # doctest: +NORMALIZE_WHITESPACE
+    [['a' 'x']
+     ['a' 'y']
+     ['a' 'y']
+     ['b' 'y']]
+
 .. _mice:
 
 Multivariate feature imputation
@@ -76,7 +94,7 @@ Here is an example snippet::
     >>> imp = MICEImputer(n_imputations=10, random_state=0)
     >>> imp.fit([[1, 2], [np.nan, 3], [7, np.nan]])
     MICEImputer(imputation_order='ascending', initial_strategy='mean',
-           max_value=None, min_value=None, missing_values='NaN', n_burn_in=10,
+           max_value=None, min_value=None, missing_values=nan, n_burn_in=10,
            n_imputations=10, n_nearest_features=None, predictor=None,
            random_state=0, verbose=False)
     >>> X_test = [[np.nan, 2], [6, np.nan], [np.nan, 6]]
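As a quick check of the behaviour documented above, the new ``'constant'``
strategy can be exercised directly. This is a minimal sketch against the
post-commit API; the input arrays are illustrative, not from the docs:

    import numpy as np
    from sklearn.impute import SimpleImputer

    # Numeric data: every np.nan is replaced by the given fill_value.
    X_num = [[1.0, np.nan], [np.nan, 3.0], [7.0, 6.0]]
    imp_num = SimpleImputer(strategy='constant', fill_value=0)
    print(imp_num.fit_transform(X_num))
    # [[1. 0.]
    #  [0. 3.]
    #  [7. 6.]]

    # Non-numeric data: an explicit string placeholder is passed here
    # rather than relying on any default fill_value.
    X_str = np.array([['a', np.nan], [np.nan, 'y']], dtype=object)
    imp_str = SimpleImputer(strategy='constant', fill_value='missing')
    print(imp_str.fit_transform(X_str))
    # [['a' 'missing']
    #  ['missing' 'y']]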

doc/whats_new/v0.20.rst

Lines changed: 11 additions & 0 deletions
@@ -625,6 +625,17 @@ Imputer
   SimpleImputer().fit_transform(X.T).T)``). :issue:`10829` by :user:`Guillaume
   Lemaitre <glemaitre>` and :user:`Gilberto Olimpio <gilbertoolimpio>`.
 
+- The :class:`impute.SimpleImputer` has a new strategy, ``'constant'``, to
+  complete missing values with a fixed one, given by the ``fill_value``
+  parameter. This strategy supports numeric and non-numeric data, and so does
+  the ``'most_frequent'`` strategy now. :issue:`11211` by :user:`Jeremie du
+  Boisberranger <jeremiedbb>`.
+
+- The NaN marker for the missing values has been changed between the
+  :class:`preprocessing.Imputer` and the :class:`impute.SimpleImputer`.
+  ``missing_values='NaN'`` should now be ``missing_values=np.nan``.
+  :issue:`11211` by :user:`Jeremie du Boisberranger <jeremiedbb>`.
+
 Outlier Detection models
 
 - More consistent outlier detection API:
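For code being migrated from :class:`preprocessing.Imputer`, the marker change
above amounts to swapping the string sentinel for np.nan itself. A minimal
before/after sketch, with illustrative toy data:

    import numpy as np
    from sklearn.impute import SimpleImputer

    # Before (preprocessing.Imputer): missing_values='NaN'
    # After (impute.SimpleImputer): pass np.nan itself as the marker.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit([[1, 2], [np.nan, 3], [7, 6]])
    print(imp.transform([[np.nan, 2], [6, np.nan]]))
    # Column means learned at fit time are [4.0, 11/3]:
    # [[4.         2.        ]
    #  [6.         3.66666667]]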

examples/compose/column_transformer_mixed_types.py renamed to examples/compose/plot_column_transformer_mixed_types.py

Lines changed: 25 additions & 22 deletions
@@ -27,14 +27,16 @@
 from __future__ import print_function
 
 import pandas as pd
+import numpy as np
 
-from sklearn.compose import make_column_transformer
-from sklearn.pipeline import make_pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
 from sklearn.preprocessing import StandardScaler, CategoricalEncoder
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, GridSearchCV
 
+np.random.seed(0)
 
 # Read data from Titanic dataset.
 titanic_url = ('https://raw.githubusercontent.com/amueller/'
@@ -49,36 +51,37 @@
 # - embarked: categories encoded as strings {'C', 'S', 'Q'}.
 # - sex: categories encoded as strings {'female', 'male'}.
 # - pclass: ordinal integers {1, 2, 3}.
-numeric_features = ['age', 'fare']
-categorical_features = ['embarked', 'sex', 'pclass']
-
-# Provisionally, use pd.fillna() to impute missing values for categorical
-# features; SimpleImputer will eventually support strategy="constant".
-data[categorical_features] = data[categorical_features].fillna(value='missing')
 
 # We create the preprocessing pipelines for both numeric and categorical data.
-numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler())
-categorical_transformer = CategoricalEncoder('onehot-dense',
-                                             handle_unknown='ignore')
+numeric_features = ['age', 'fare']
+numeric_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='median')),
+    ('scaler', StandardScaler())])
+
+categorical_features = ['embarked', 'sex', 'pclass']
+categorical_transformer = Pipeline(steps=[
+    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
+    ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))])
 
-preprocessing_pl = make_column_transformer(
-    (numeric_features, numeric_transformer),
-    (categorical_features, categorical_transformer),
-    remainder='drop'
-)
+preprocessor = ColumnTransformer(
+    transformers=[
+        ('num', numeric_transformer, numeric_features),
+        ('cat', categorical_transformer, categorical_features)],
+    remainder='drop')
 
 # Append classifier to preprocessing pipeline.
 # Now we have a full prediction pipeline.
-clf = make_pipeline(preprocessing_pl, LogisticRegression())
+clf = Pipeline(steps=[('preprocessor', preprocessor),
+                      ('classifier', LogisticRegression())])
 
 X = data.drop('survived', axis=1)
-y = data.survived.values
+y = data['survived']
 
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     shuffle=True)
 
 clf.fit(X_train, y_train)
-print("model score: %f" % clf.score(X_test, y_test))
+print("model score: %.3f" % clf.score(X_test, y_test))
 
 
 ###############################################################################
@@ -93,12 +96,12 @@
 
 
 param_grid = {
-    'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'],
-    'logisticregression__C': [0.1, 1.0, 1.0],
+    'preprocessor__num__imputer__strategy': ['mean', 'median'],
+    'classifier__C': [0.1, 1.0, 10, 100],
 }
 
 grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False)
 grid_search.fit(X_train, y_train)
 
-print(("best logistic regression from grid search: %f"
+print(("best logistic regression from grid search: %.3f"
       % grid_search.score(X_test, y_test)))
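The renamed grid-search keys follow scikit-learn's nested parameter convention,
where each double-underscore segment descends one named step. A minimal sketch
assuming the same step names as the example above (the column indices and data
are illustrative, no Titanic data needed):

    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    num = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                          ('scaler', StandardScaler())])
    pre = ColumnTransformer(transformers=[('num', num, [0, 1])])
    clf = Pipeline(steps=[('preprocessor', pre),
                          ('classifier', LogisticRegression())])

    # 'preprocessor__num__imputer__strategy' descends:
    # Pipeline step 'preprocessor' -> ColumnTransformer entry 'num'
    # -> Pipeline step 'imputer' -> its 'strategy' parameter.
    clf.set_params(preprocessor__num__imputer__strategy='mean')
    print(clf.get_params()['preprocessor__num__imputer__strategy'])  # 'mean'

Naming the steps explicitly ('num', 'imputer', 'classifier') is what makes the
shorter, more readable param_grid keys in the updated example possible.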
