Skip to content

Integrate Sklearn OneHotEncoder #830

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
Jun 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
5e2d533
Preliminary attempt to add sk's onehot
eccabay May 29, 2020
a407776
Integrated top_n into categories (some test failures)
eccabay Jun 1, 2020
6ebdebf
SKlearn integrated fully except for missing and unknown value handling
eccabay Jun 2, 2020
db8b9f0
add support for top_n being None
eccabay Jun 2, 2020
ec1d143
Fake-out NaN values
eccabay Jun 2, 2020
c4d692e
Fake NaN values only for categorical columns
eccabay Jun 2, 2020
8659ef0
Add increased testing
eccabay Jun 3, 2020
331be4e
Merge branch 'master' into 776_sklearn_onehot
eccabay Jun 3, 2020
104dddc
Update changelog
eccabay Jun 3, 2020
1f4059b
Fix lint and unit test issues
eccabay Jun 3, 2020
24c75b7
Address PR comments
eccabay Jun 4, 2020
6511d28
Add missing support for categories argument (oops)
eccabay Jun 4, 2020
be1942b
Fix codecov by adding a test
eccabay Jun 4, 2020
d09c3a2
Merge branch 'master' into 776_sklearn_onehot
eccabay Jun 4, 2020
ab89776
fix master merge error
eccabay Jun 4, 2020
14dc0fd
Rework encoder to remove self.col_unique_vals (redundant w/categories)
eccabay Jun 5, 2020
6505d72
Address PR comments
eccabay Jun 11, 2020
64cc83f
Add missing tests
eccabay Jun 11, 2020
a7f5088
Merge branch 'master' into 776_sklearn_onehot
eccabay Jun 11, 2020
3813863
Remove comments and fix ValueError matching
eccabay Jun 11, 2020
ed36917
Merge master into branch
eccabay Jun 12, 2020
bbd8357
Fix test failing caused by merge
eccabay Jun 12, 2020
490a04f
Merge branch 'master' into 776_sklearn_onehot
eccabay Jun 15, 2020
f6e6d67
Final PR fixes
eccabay Jun 16, 2020
b7c3e56
Merge branch 'master' into 776_sklearn_onehot
eccabay Jun 16, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Changelog
* Added data check to check for problematic target labels :pr:`814`
* Added PerColumnImputer that allows imputation strategies per column :pr:`824`
* Added transformer to drop specific columns :pr:`827`
* Added support for `categories`, `handle_unknown`, `handle_missing`, and `drop` parameters in `OneHotEncoder` :pr:`830`
* Added preprocessing component to handle DateTime columns featurization :pr:`838`
* Added ability to clone pipelines and components :pr:`842`
* Define getter method for component `parameters` :pr:`847`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@

class LogisticRegressionBinaryPipeline(BinaryClassificationPipeline):
"""Logistic Regression Pipeline for binary classification"""
component_graph = ['One Hot Encoder', 'Simple Imputer', 'Standard Scaler', 'Logistic Regression Classifier']
component_graph = ['Simple Imputer', 'One Hot Encoder', 'Standard Scaler', 'Logistic Regression Classifier']
2 changes: 1 addition & 1 deletion evalml/pipelines/classification/random_forest_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
class RFBinaryClassificationPipeline(BinaryClassificationPipeline):
"""Random Forest Pipeline for binary classification"""
custom_name = "Random Forest Binary Classification Pipeline"
component_graph = ['One Hot Encoder', 'Simple Imputer', 'Random Forest Classifier']
component_graph = ['Simple Imputer', 'One Hot Encoder', 'Random Forest Classifier']
2 changes: 1 addition & 1 deletion evalml/pipelines/classification/xgboost_binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
class XGBoostBinaryPipeline(BinaryClassificationPipeline):
"""XGBoost Pipeline for binary classification"""
custom_name = "XGBoost Binary Classification Pipeline"
component_graph = ['One Hot Encoder', 'Simple Imputer', 'XGBoost Classifier']
component_graph = ['Simple Imputer', 'One Hot Encoder', 'XGBoost Classifier']
103 changes: 84 additions & 19 deletions evalml/pipelines/components/transformers/encoders/onehot_encoder.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder as SKOneHotEncoder

from .encoder import CategoricalEncoder

Expand All @@ -11,9 +12,47 @@ class OneHotEncoder(CategoricalEncoder):
name = 'One Hot Encoder'
hyperparameter_ranges = {}

def __init__(self,
             top_n=10,
             categories=None,
             drop=None,
             handle_unknown="ignore",
             handle_missing="error",
             random_state=0):
    """Initializes a transformer that encodes categorical features in a one-hot numeric array.

    Arguments:
        top_n (int): Number of categories per column to encode. If None, all categories will be encoded.
            Otherwise, the `n` most frequent will be encoded and all others will be dropped. Defaults to 10.
        categories (list): A two dimensional list of categories, where `categories[i]` is a list of the categories
            for the column at index `i`. This can also be `None`, or `"auto"` if `top_n` is not None. Defaults to None.
        drop (string): Method ("first" or "if_binary") to use to drop one category per feature. Can also be
            a list specifying which method to use for each feature. Defaults to None.
        handle_unknown (string): Whether to ignore or error for unknown categories for a feature encountered
            during `fit` or `transform`. If either `top_n` or `categories` is used to limit the number of categories
            per column, this must be "ignore". Defaults to "ignore".
        handle_missing (string): Options for how to handle missing (NaN) values encountered during
            `fit` or `transform`. If this is set to "as_category" and NaN values are within the `n` most frequent,
            "nan" values will be encoded as their own column. If this is set to "error", any missing
            values encountered will raise an error. Defaults to "error".
        random_state (int): Seed for the random number generator. Defaults to 0.

    Raises:
        ValueError: If `handle_unknown` or `handle_missing` is not a recognized option, or if both
            `top_n` and `categories` are provided (they are mutually exclusive ways to limit categories).
    """
    parameters = {"top_n": top_n,
                  "categories": categories,
                  "drop": drop,
                  "handle_unknown": handle_unknown,
                  "handle_missing": handle_missing}

    # Validate string options up front so a misconfigured encoder fails at
    # construction time rather than partway through fit/transform.
    unknown_input_options = ["ignore", "error"]
    missing_input_options = ["as_category", "error"]
    if handle_unknown not in unknown_input_options:
        raise ValueError("Invalid input {} for handle_unknown".format(handle_unknown))
    if handle_missing not in missing_input_options:
        raise ValueError("Invalid input {} for handle_missing".format(handle_missing))
    if top_n is not None and categories is not None:
        raise ValueError("Cannot use categories and top_n arguments simultaneously")

    # The wrapped scikit-learn encoder is created during fit(), once the
    # per-column category lists are known.
    self._encoder = None
    super().__init__(parameters=parameters,
                     component_obj=None,
                     random_state=random_state)
def fit(self, X, y=None):
    """Fits the one-hot encoder on the categorical columns of X.

    Determines the category list for each categorical column (honoring
    `top_n` / `categories`), applies the configured missing-value policy, and
    fits an underlying scikit-learn OneHotEncoder on those columns.

    Arguments:
        X (pd.DataFrame or array-like): Input features to fit on.
        y: Ignored. Present for pipeline-interface compatibility.

    Returns:
        self

    Raises:
        ValueError: If `handle_missing="error"` and X contains NaN, or if the
            `categories` parameter does not supply one list per categorical column.
    """
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    top_n = self.parameters['top_n']
    cols_to_encode = self._get_cat_cols(X)

    if self.parameters['handle_missing'] == "as_category":
        # Copy before replacing so we never mutate the caller's DataFrame;
        # NaNs become the literal string "nan" so they get their own column.
        X_t = X.copy()
        X_t[cols_to_encode] = X_t[cols_to_encode].replace(np.nan, "nan")
    else:
        if self.parameters['handle_missing'] == "error" and X.isnull().any().any():
            raise ValueError("Input contains NaN")
        X_t = X

    if len(cols_to_encode) == 0:
        # No categorical columns: let sklearn infer (it will see an empty frame).
        categories = 'auto'
    elif self.parameters['categories'] is not None:
        categories = self.parameters['categories']
        # sklearn's own error for a malformed categories argument is obscure;
        # validate the shape here and give actionable feedback instead.
        if len(categories) != len(cols_to_encode) or not isinstance(categories[0], list):
            raise ValueError('Categories argument must contain a list of categories for each categorical feature')
    else:
        # Build the per-column category lists ourselves so we can apply top_n.
        categories = []
        for col in X_t[cols_to_encode]:
            value_counts = X_t[col].value_counts(dropna=False).to_frame()
            if top_n is None or len(value_counts) <= top_n:
                unique_values = value_counts.index.tolist()
            else:
                # Shuffle before the stable sort so ties beyond the top_n
                # cutoff are broken reproducibly via random_state.
                value_counts = value_counts.sample(frac=1, random_state=self.random_state)
                value_counts = value_counts.sort_values([col], ascending=False, kind='mergesort')
                unique_values = value_counts.head(top_n).index.tolist()
            unique_values = np.sort(unique_values)
            categories.append(unique_values)

    # Create an encoder to pass off the rest of the computation to
    self._encoder = SKOneHotEncoder(categories=categories,
                                    drop=self.parameters['drop'],
                                    handle_unknown=self.parameters['handle_unknown'])
    self._encoder.fit(X_t[cols_to_encode])
    return self

def transform(self, X, y=None):
    """One-hot encode the categorical features of X.

    Arguments:
        X (pd.DataFrame or array-like): Input features to transform.
        y: Ignored. Present for pipeline-interface compatibility.

    Returns:
        Transformed dataframe, where each categorical feature has been encoded into
        numerical columns using one-hot encoding; non-categorical columns pass through untouched.

    Raises:
        RuntimeError: If called before `fit`.
        ValueError: If `handle_missing="error"` and X contains NaN.
    """
    if self._encoder is None:
        raise RuntimeError("You must fit one hot encoder before calling transform!")
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    cat_cols = self._get_cat_cols(X)

    if self.parameters['handle_missing'] == "as_category":
        # Copy before replacing so the caller's DataFrame is not mutated;
        # mirror fit()'s NaN -> "nan" faux-category substitution.
        X = X.copy()
        X[cat_cols] = X[cat_cols].replace(np.nan, "nan")
    if self.parameters['handle_missing'] == "error" and X.isnull().any().any():
        raise ValueError("Input contains NaN")

    X_t = pd.DataFrame()
    # Add the non-categorical columns, untouched
    for col in X.columns:
        if col not in cat_cols:
            X_t = pd.concat([X_t, X[col]], axis=1)

    # Call sklearn's transform on the categorical columns
    if len(cat_cols) > 0:
        X_cat = pd.DataFrame(self._encoder.transform(X[cat_cols]).toarray())
        X_cat.columns = self._encoder.get_feature_names(input_features=cat_cols)
        # Reindex so the passthrough columns align row-wise with the encoded ones.
        X_t = pd.concat([X_t.reindex(X_cat.index), X_cat], axis=1)

    return X_t
6 changes: 5 additions & 1 deletion evalml/tests/component_tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,12 @@ def test_describe_component():
column_imputer = PerColumnImputer({"a": "mean", "b": ("constant", 100)})
scaler = StandardScaler()
feature_selection = RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf)
assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10,
'categories': None,
'drop': None,
'handle_unknown': 'ignore',
'handle_missing': 'error'}}
drop_col_transformer = DropColumns(columns=['col_one', 'col_two'])
assert enc.describe(return_dict=True) == {'name': 'One Hot Encoder', 'parameters': {'top_n': 10}}
assert imputer.describe(return_dict=True) == {'name': 'Simple Imputer', 'parameters': {'impute_strategy': 'mean', 'fill_value': None}}
assert column_imputer.describe(return_dict=True) == {'name': 'Per Column Imputer', 'parameters': {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}}
assert scaler.describe(return_dict=True) == {'name': 'Standard Scaler', 'parameters': {}}
Expand Down
Loading