Standardize inputs as pd.DataFrame / pd.Series #130

Merged: 18 commits, Nov 21, 2019
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -8,6 +8,7 @@ Changelog
* Lower botocore requirement :pr:`235`
* Changes
* Updating demo datasets to retain column names :pr:`223`
* Standardizing inputs as pd.DataFrame / pd.Series :pr:`130`
* Documentation Changes
* Testing Changes
* Added support for testing on Windows with CircleCI :pr:`226`
1 change: 0 additions & 1 deletion evalml/__init__.py
@@ -22,7 +22,6 @@

- from evalml.pipelines import list_model_types, save_pipeline, load_pipeline
from evalml.models import AutoClassifier, AutoRegressor
from evalml.pipelines import list_model_types, load_pipeline, save_pipeline

warnings.filterwarnings("ignore", category=DeprecationWarning)

16 changes: 16 additions & 0 deletions evalml/objectives/fraud_cost.py
@@ -1,3 +1,5 @@
import pandas as pd

from .objective_base import ObjectiveBase

from evalml.problem_types import ProblemTypes
@@ -38,12 +40,26 @@ def __init__(self, retry_percentage=.5, interchange_fee=.02,
def decision_function(self, y_predicted, extra_cols, threshold):
"""Determine if transaction is fraud given predicted probabilities,
dataframe with transaction amount, and threshold"""
if not isinstance(extra_cols, pd.DataFrame):
extra_cols = pd.DataFrame(extra_cols)

if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

transformed_probs = (y_predicted * extra_cols[self.amount_col])
return transformed_probs > threshold

def objective_function(self, y_predicted, y_true, extra_cols):
"""Calculate amount lost to fraud given predictions, true values, and dataframe
with transaction amount"""
if not isinstance(extra_cols, pd.DataFrame):
extra_cols = pd.DataFrame(extra_cols)

if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

if not isinstance(y_true, pd.Series):
y_true = pd.Series(y_true)

# extract the transaction amounts from the amount column in the user's data
transaction_amount = extra_cols[self.amount_col]
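For illustration, a minimal sketch of what the new coercion enables, mirroring the updated test below (assumes FraudCost is exported from evalml.objectives, as in the test suite):

import numpy as np

from evalml.objectives import FraudCost

fraud_cost = FraudCost(amount_col="value")

# a raw numpy array and a plain dict are coerced to pd.Series / pd.DataFrame internally
y_predicted = np.array([.1, .5, .5])
extra_cols = {"value": [100, 5, 25]}

out = fraud_cost.decision_function(y_predicted, extra_cols, 5)
# out is a boolean pd.Series: True where probability * amount exceeds the threshold
assert out.tolist() == [True, False, True]
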
11 changes: 11 additions & 0 deletions evalml/objectives/lead_scoring.py
@@ -1,3 +1,5 @@
import pandas as pd

from .objective_base import ObjectiveBase

from evalml.problem_types import ProblemTypes
@@ -28,9 +30,18 @@ def __init__(self, true_positives=1, false_positives=-1, verbose=False):
super().__init__(verbose=verbose)

def decision_function(self, y_predicted, threshold):
if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

return y_predicted > threshold

def objective_function(self, y_predicted, y_true):
if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

if not isinstance(y_true, pd.Series):
y_true = pd.Series(y_true)

true_positives = (y_true & y_predicted).sum()
false_positives = (~y_true & y_predicted).sum()

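Likewise for LeadScoring, a short sketch of the intended behavior with non-pandas input (constructor weights as in the hunk above; the import path is assumed):

import numpy as np

from evalml.objectives import LeadScoring

objective = LeadScoring(true_positives=1, false_positives=-1)

# a plain numpy array is coerced to pd.Series before the threshold comparison
predicted = np.array([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
assert out.tolist() == [False, True, False, True]
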
21 changes: 17 additions & 4 deletions evalml/pipelines/pipeline_base.py
@@ -240,11 +240,14 @@ def predict(self, X):
"""Make predictions using selected features.

Args:
- X (DataFrame) : features
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

Returns:
Series : estimated labels
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X_t = self._transform(X)

if self.objective and self.objective.needs_fitting:
@@ -265,13 +268,17 @@ def predict_proba(self, X):
"""Make probability estimates for labels.

Args:
- X (DataFrame) : features
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

Returns:
DataFrame : probability estimates
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = self._transform(X)
proba = self.estimator.predict_proba(X)

if proba.shape[1] <= 2:
return proba[:, 1]
else:
@@ -281,13 +288,19 @@ def score(self, X, y, other_objectives=None):
"""Evaluate model performance on current and additional objectives

Args:
- X (DataFrame) : features for model predictions
- y (Series) : true labels
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
y (pd.Series or np.array) : true labels of length [n_samples]
other_objectives (list): list of other objectives to score

Returns:
score, ordered dictionary of other objective scores
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

if not isinstance(y, pd.Series):
y = pd.Series(y)

other_objectives = other_objectives or []
other_objectives = [get_objective(o) for o in other_objectives]
y_predicted = None
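Taken together, the pipeline entry points no longer require pandas inputs. A hedged sketch of the intended usage (assumes AutoClassifier's default objective; the data is illustrative):

import numpy as np

from evalml import AutoClassifier

X = np.random.random((100, 5))  # plain numpy features, no DataFrame required
y = np.array([0, 1] * 50)       # plain numpy labels, no Series required

clf = AutoClassifier(max_pipelines=1)
clf.fit(X, y)

pipeline = clf.best_pipeline
preds = pipeline.predict(X)        # X is coerced to pd.DataFrame internally
probs = pipeline.predict_proba(X)
score, _ = pipeline.score(X, y)    # y is coerced to pd.Series internally
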
9 changes: 7 additions & 2 deletions evalml/preprocessing/utils.py
@@ -52,15 +52,20 @@ def split_data(X, y, regression=False, test_size=.2, random_state=None):
"""Splits data into train and test sets.

Args:
- X (DataFrame) : features
- y (Series) : labels
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
y (pd.Series or np.array) : labels of length [n_samples]
regression (bool): if true, do not use stratified split
test_size (float) : percent of train set to hold out for testing
random_state (int) : seed for the random number generator

Returns:
DataFrame, DataFrame, Series, Series : features and labels each split into train and test sets
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)

if regression:
CV_method = ShuffleSplit(n_splits=1,
test_size=test_size,
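split_data gets the same treatment, so raw arrays can be split directly. A small sketch (illustrative data):

import numpy as np

from evalml.preprocessing import split_data

X = np.random.random((100, 5))
y = np.array([0, 1] * 50)

# X and y are coerced to pd.DataFrame / pd.Series before the stratified split
X_train, X_test, y_train, y_test = split_data(X, y, test_size=.25, random_state=0)
assert len(X_test) == 25
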
20 changes: 15 additions & 5 deletions evalml/tests/objective_tests/test_fraud_detection.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from evalml import AutoClassifier
@@ -15,8 +16,6 @@ def test_function(X_y):
)

clf = AutoClassifier(objective=objective, max_pipelines=1)

- X = pd.DataFrame(X)
clf.fit(X, y)

pipeline = clf.best_pipeline
@@ -26,8 +25,19 @@ def test_function(X_y):

fraud_cost = FraudCost(amount_col="value")

- probabilities = pd.Series([.1, .5, .5])
y_predicted = pd.Series([.1, .5, .5])
y_true = [True, False, True]
extra_columns = pd.DataFrame({"value": [100, 5, 25]})

- out = fraud_cost.decision_function(probabilities, extra_columns, 5)
- assert out.tolist() == [True, False, True]
out = fraud_cost.decision_function(y_predicted, extra_columns, 5)
assert out.tolist() == y_true
score = fraud_cost.score(out, y_true, extra_columns)
assert (score == 0.0)

# testing with other types of inputs
y_predicted = np.array([.1, .5, .5])
extra_columns = {"value": [100, 5, 25]}
out = fraud_cost.decision_function(y_predicted, extra_columns, 5)
assert out.tolist() == y_true
score = fraud_cost.score(out, y_true, extra_columns)
assert (score == 0.0)
16 changes: 13 additions & 3 deletions evalml/tests/objective_tests/test_lead_scoring.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from evalml import AutoClassifier
@@ -13,11 +14,20 @@ def test_function(X_y):
)

clf = AutoClassifier(objective=objective, max_pipelines=1, random_state=0)

- X = pd.DataFrame(X)
clf.fit(X, y)

pipeline = clf.best_pipeline
pipeline.predict(X)
pipeline.predict_proba(X)
pipeline.score(X, y)

predicted = pd.Series([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
y_true = [False, True, False, True]
assert out.tolist() == [False, True, False, True]

predicted = np.array([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
assert out.tolist() == y_true

score = objective.score(out, y_true)
assert (score == 0.5)
4 changes: 0 additions & 4 deletions evalml/tests/objective_tests/test_objectives.py
@@ -1,5 +1,3 @@
- import pandas as pd

from evalml.objectives import (
Precision,
PrecisionMacro,
@@ -24,8 +22,6 @@ def test_get_objectives_types():

def test_binary_average(X_y):
X, y = X_y
- X = pd.DataFrame(X)
- y = pd.Series(y)

pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', number_features=0)
pipeline.fit(X, y)
3 changes: 0 additions & 3 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -1,6 +1,5 @@
import os

- import pandas as pd
import pytest

from evalml.model_types import ModelTypes
@@ -72,8 +71,6 @@ def test_load_pickled_pipeline_with_custom_objective(X_y, pickled_pipeline_path)

def test_reproducibility(X_y):
X, y = X_y
- X = pd.DataFrame(X)

objective = FraudCost(
retry_percentage=.5,
interchange_fee=.02,
6 changes: 0 additions & 6 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -1,12 +1,8 @@
- import pandas as pd

from evalml.preprocessing import split_data


def test_split_regression(X_y_reg):
X, y = X_y_reg
- X = pd.DataFrame(X)
- y = pd.Series(y)
test_pct = 0.25
X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct, regression=True)
test_size = len(X) * test_pct
@@ -19,8 +15,6 @@ def test_split_regression(X_y_reg):

def test_split_classification(X_y):
X, y = X_y
- X = pd.DataFrame(X)
- y = pd.Series(y)
test_pct = 0.25
X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct)
test_size = len(X) * 0.25