-
Notifications
You must be signed in to change notification settings - Fork 91
Integrate Sklearn OneHotEncoder #830
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5e2d533
a407776
6ebdebf
db8b9f0
ec1d143
c4d692e
8659ef0
331be4e
104dddc
1f4059b
24c75b7
6511d28
be1942b
d09c3a2
ab89776
14dc0fd
6505d72
64cc83f
a7f5088
3813863
ed36917
bbd8357
490a04f
f6e6d67
b7c3e56
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.preprocessing import OneHotEncoder as SKOneHotEncoder | ||
|
||
from .encoder import CategoricalEncoder | ||
|
||
|
@@ -11,9 +12,47 @@ class OneHotEncoder(CategoricalEncoder): | |
name = 'One Hot Encoder' | ||
hyperparameter_ranges = {} | ||
|
||
def __init__(self, top_n=10, random_state=0): | ||
"""Initalizes self.""" | ||
parameters = {"top_n": top_n} | ||
def __init__(self, | ||
top_n=10, | ||
categories=None, | ||
drop=None, | ||
handle_unknown="ignore", | ||
handle_missing="error", | ||
random_state=0): | ||
"""Initializes a transformer that encodes categorical features in a one-hot numeric array. | ||
|
||
Arguments: | ||
top_n (int): Number of categories per column to encode. If None, all categories will be encoded. | ||
Otherwise, the `n` most frequent will be encoded and all others will be dropped. Defaults to 10. | ||
categories (list): A two dimensional list of categories, where `categories[i]` is a list of the categories | ||
for the column at index `i`. This can also be `None`, or `"auto"` if `top_n` is not None. Defaults to None. | ||
drop (string): Method ("first" or "if_binary") to use to drop one category per feature. Can also be | ||
a list specifying which method to use for each feature. Defaults to None. | ||
handle_unknown (string): Whether to ignore or error for unknown categories for a feature encountered | ||
during `fit` or `transform`. If either `top_n` or `categories` is used to limit the number of categories | ||
per column, this must be "ignore". Defaults to "ignore". | ||
handle_missing (string): Options for how to handle missing (NaN) values encountered during | ||
`fit` or `transform`. If this is set to "as_category" and NaN values are within the `n` most frequent, | ||
"nan" values will be encoded as their own column. If this is set to "error", any missing | ||
values encountered will raise an error. Defaults to "error". | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice! |
||
""" | ||
parameters = {"top_n": top_n, | ||
"categories": categories, | ||
"drop": drop, | ||
"handle_unknown": handle_unknown, | ||
"handle_missing": handle_missing} | ||
|
||
# Check correct inputs | ||
unknown_input_options = ["ignore", "error"] | ||
missing_input_options = ["as_category", "error"] | ||
if handle_unknown not in unknown_input_options: | ||
raise ValueError("Invalid input {} for handle_unknown".format(handle_unknown)) | ||
if handle_missing not in missing_input_options: | ||
raise ValueError("Invalid input {} for handle_missing".format(handle_missing)) | ||
if top_n is not None and categories is not None: | ||
raise ValueError("Cannot use categories and top_n arguments simultaneously") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Love it, this is very clear |
||
|
||
self._encoder = None | ||
super().__init__(parameters=parameters, | ||
component_obj=None, | ||
random_state=random_state) | ||
|
@@ -32,17 +71,38 @@ def fit(self, X, y=None): | |
X = pd.DataFrame(X) | ||
X_t = X | ||
cols_to_encode = self._get_cat_cols(X_t) | ||
self.col_unique_values = {} | ||
for col in X_t.columns: | ||
if col in cols_to_encode: | ||
|
||
if self.parameters['handle_missing'] == "as_category": | ||
X_t[cols_to_encode] = X_t[cols_to_encode].replace(np.nan, "nan") | ||
elif self.parameters['handle_missing'] == "error" and X.isnull().any().any(): | ||
raise ValueError("Input contains NaN") | ||
|
||
if len(cols_to_encode) == 0: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we have unit test coverage of this case? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yep, |
||
categories = 'auto' | ||
eccabay marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
elif self.parameters['categories'] is not None: | ||
categories = self.parameters['categories'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could things break if this comes back as an empty list? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm assuming not, since we start with an empty list in the There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a good catch, scikit-learn throws an error in that case. I'll add our own catch to provide more useful feedback. |
||
if len(categories) != len(cols_to_encode) or not isinstance(categories[0], list): | ||
raise ValueError('Categories argument must contain a list of categories for each categorical feature') | ||
|
||
else: | ||
categories = [] | ||
for col in X_t[cols_to_encode]: | ||
value_counts = X_t[col].value_counts(dropna=False).to_frame() | ||
if len(value_counts) <= top_n: | ||
if top_n is None or len(value_counts) <= top_n: | ||
unique_values = value_counts.index.tolist() | ||
else: | ||
value_counts = value_counts.sample(frac=1, random_state=self.random_state) | ||
value_counts = value_counts.sort_values([col], ascending=False, kind='mergesort') | ||
unique_values = value_counts.head(top_n).index.tolist() | ||
self.col_unique_values[col] = unique_values | ||
unique_values = np.sort(unique_values) | ||
categories.append(unique_values) | ||
eccabay marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
# Create an encoder to pass off the rest of the computation to | ||
self._encoder = SKOneHotEncoder(categories=categories, | ||
drop=self.parameters['drop'], | ||
handle_unknown=self.parameters['handle_unknown']) | ||
self._encoder.fit(X_t[cols_to_encode]) | ||
return self | ||
|
||
def transform(self, X, y=None): | ||
|
@@ -54,22 +114,27 @@ def transform(self, X, y=None): | |
Returns: | ||
Transformed dataframe, where each categorical feature has been encoded into numerical columns using one-hot encoding. | ||
""" | ||
try: | ||
col_values = self.col_unique_values | ||
except AttributeError: | ||
if self._encoder is None: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. 👍 good call! I'm working on a plan to standardize this, using decorators and a metaclass, but in the meantime it's good we're adding this logic to more components. |
||
raise RuntimeError("You must fit one hot encoder before calling transform!") | ||
if not isinstance(X, pd.DataFrame): | ||
X = pd.DataFrame(X) | ||
cat_cols = self._get_cat_cols(X) | ||
|
||
if self.parameters['handle_missing'] == "as_category": | ||
X[cat_cols] = X[cat_cols].replace(np.nan, "nan") | ||
eccabay marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if self.parameters['handle_missing'] == "error" and X.isnull().any().any(): | ||
raise ValueError("Input contains NaN") | ||
|
||
X_t = pd.DataFrame() | ||
# Add the non-categorical columns, untouched | ||
for col in X.columns: | ||
if col in col_values: | ||
unique = col_values[col] | ||
for label in unique: | ||
new_name = str(col) + "_" + str(label) | ||
add = (X[col] == label).astype("uint8") | ||
add = add.rename(new_name) | ||
X_t = pd.concat([X_t, add], axis=1) | ||
else: | ||
if col not in cat_cols: | ||
X_t = pd.concat([X_t, X[col]], axis=1) | ||
|
||
# Call sklearn's transform on the categorical columns | ||
if len(cat_cols) > 0: | ||
X_cat = pd.DataFrame(self._encoder.transform(X[cat_cols]).toarray()) | ||
X_cat.columns = self._encoder.get_feature_names(input_features=cat_cols) | ||
X_t = pd.concat([X_t.reindex(X_cat.index), X_cat], axis=1) | ||
|
||
return X_t |
Uh oh!
There was an error while loading. Please reload this page.