Standardize inputs as pd.DataFrame / pd.Series #130

Merged: 18 commits, Nov 21, 2019
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -8,6 +8,7 @@ Changelog
* Lower botocore requirement :pr:`235`
* Changes
* Updating demo datasets to retain column names :pr:`223`
* Standardizing inputs as pd.DataFrame / pd.Series :pr:`130`
* Documentation Changes
* Testing Changes
* Added support for testing on Windows with CircleCI :pr:`226`
1 change: 0 additions & 1 deletion evalml/__init__.py
@@ -22,7 +22,6 @@

- from evalml.pipelines import list_model_types, save_pipeline, load_pipeline
from evalml.models import AutoClassifier, AutoRegressor
from evalml.pipelines import list_model_types, load_pipeline, save_pipeline

warnings.filterwarnings("ignore", category=DeprecationWarning)

16 changes: 16 additions & 0 deletions evalml/objectives/fraud_cost.py
@@ -1,3 +1,5 @@
import pandas as pd

from .objective_base import ObjectiveBase

from evalml.problem_types import ProblemTypes
@@ -38,12 +40,26 @@ def __init__(self, retry_percentage=.5, interchange_fee=.02,
def decision_function(self, y_predicted, extra_cols, threshold):
"""Determine if transaction is fraud given predicted probabilities,
dataframe with transaction amount, and threshold"""
if not isinstance(extra_cols, pd.DataFrame):
extra_cols = pd.DataFrame(extra_cols)

if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

transformed_probs = (y_predicted * extra_cols[self.amount_col])
return transformed_probs > threshold

def objective_function(self, y_predicted, y_true, extra_cols):
"""Calculate amount lost to fraud given predictions, true values, and dataframe
with transaction amount"""
if not isinstance(extra_cols, pd.DataFrame):
extra_cols = pd.DataFrame(extra_cols)

if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

if not isinstance(y_true, pd.Series):
y_true = pd.Series(y_true)

# extract the transaction amounts from the amount column in the user's data
transaction_amount = extra_cols[self.amount_col]
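For illustration, a minimal sketch of what the new coercion enables, mirroring the updated test below (assumes FraudCost is exported from evalml.objectives, as in the test suite):

import numpy as np

from evalml.objectives import FraudCost

fraud_cost = FraudCost(amount_col="value")

# a raw numpy array and a plain dict are coerced to pd.Series / pd.DataFrame internally
y_predicted = np.array([.1, .5, .5])
extra_cols = {"value": [100, 5, 25]}

out = fraud_cost.decision_function(y_predicted, extra_cols, 5)
# out is a boolean pd.Series: True where probability * amount exceeds the threshold
assert out.tolist() == [True, False, True]
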
11 changes: 11 additions & 0 deletions evalml/objectives/lead_scoring.py
@@ -1,3 +1,5 @@
import pandas as pd

from .objective_base import ObjectiveBase

from evalml.problem_types import ProblemTypes
@@ -28,9 +30,18 @@ def __init__(self, true_positives=1, false_positives=-1, verbose=False):
super().__init__(verbose=verbose)

def decision_function(self, y_predicted, threshold):
if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

return y_predicted > threshold

def objective_function(self, y_predicted, y_true):
if not isinstance(y_predicted, pd.Series):
y_predicted = pd.Series(y_predicted)

if not isinstance(y_true, pd.Series):
y_true = pd.Series(y_true)

true_positives = (y_true & y_predicted).sum()
false_positives = (~y_true & y_predicted).sum()

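Likewise for LeadScoring, a short sketch of the intended behavior with non-pandas input (constructor weights as in the hunk above; the import path is assumed):

import numpy as np

from evalml.objectives import LeadScoring

objective = LeadScoring(true_positives=1, false_positives=-1)

# a plain numpy array is coerced to pd.Series before the threshold comparison
predicted = np.array([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
assert out.tolist() == [False, True, False, True]
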
21 changes: 17 additions & 4 deletions evalml/pipelines/pipeline_base.py
@@ -240,11 +240,14 @@ def predict(self, X):
"""Make predictions using selected features.

Args:
- X (DataFrame) : features
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

Returns:
Series : estimated labels
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X_t = self._transform(X)

if self.objective and self.objective.needs_fitting:
@@ -265,13 +268,17 @@ def predict_proba(self, X):
"""Make probability estimates for labels.

Args:
- X (DataFrame) : features
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]

Returns:
DataFrame : probability estimates
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

X = self._transform(X)
proba = self.estimator.predict_proba(X)

if proba.shape[1] <= 2:
return proba[:, 1]
else:
@@ -281,13 +288,19 @@ def score(self, X, y, other_objectives=None):
"""Evaluate model performance on current and additional objectives

Args:
- X (DataFrame) : features for model predictions
- y (Series) : true labels
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
y (pd.Series or np.array) : true labels of length [n_samples]
other_objectives (list): list of other objectives to score

Returns:
score, ordered dictionary of other objective scores
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)

if not isinstance(y, pd.Series):
y = pd.Series(y)

other_objectives = other_objectives or []
other_objectives = [get_objective(o) for o in other_objectives]
y_predicted = None
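Taken together, the pipeline entry points no longer require pandas inputs. A hedged sketch of the intended usage (assumes AutoClassifier's default objective; the data is illustrative):

import numpy as np

from evalml import AutoClassifier

X = np.random.random((100, 5))  # plain numpy features, no DataFrame required
y = np.array([0, 1] * 50)       # plain numpy labels, no Series required

clf = AutoClassifier(max_pipelines=1)
clf.fit(X, y)

pipeline = clf.best_pipeline
preds = pipeline.predict(X)        # X is coerced to pd.DataFrame internally
probs = pipeline.predict_proba(X)
score, _ = pipeline.score(X, y)    # y is coerced to pd.Series internally
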
9 changes: 7 additions & 2 deletions evalml/preprocessing/utils.py
@@ -52,15 +52,20 @@ def split_data(X, y, regression=False, test_size=.2, random_state=None):
"""Splits data into train and test sets.

Args:
- X (DataFrame) : features
- y (Series) : labels
X (pd.DataFrame or np.array) : data of shape [n_samples, n_features]
y (pd.Series or np.array) : labels of length [n_samples]
regression (bool): if true, do not use stratified split
test_size (float) : percent of train set to hold out for testing
random_state (int) : seed for the random number generator

Returns:
DataFrame, DataFrame, Series, Series : features and labels each split into train and test sets
"""
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
if not isinstance(y, pd.Series):
y = pd.Series(y)

if regression:
CV_method = ShuffleSplit(n_splits=1,
test_size=test_size,
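split_data gets the same treatment, so raw arrays can be split directly. A small sketch (illustrative data):

import numpy as np

from evalml.preprocessing import split_data

X = np.random.random((100, 5))
y = np.array([0, 1] * 50)

# X and y are coerced to pd.DataFrame / pd.Series before the stratified split
X_train, X_test, y_train, y_test = split_data(X, y, test_size=.25, random_state=0)
assert len(X_test) == 25
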
20 changes: 15 additions & 5 deletions evalml/tests/objective_tests/test_fraud_detection.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from evalml import AutoClassifier
@@ -15,8 +16,6 @@ def test_function(X_y):
)

clf = AutoClassifier(objective=objective, max_pipelines=1)

- X = pd.DataFrame(X)
clf.fit(X, y)

pipeline = clf.best_pipeline
@@ -26,8 +25,19 @@ def test_function(X_y):

fraud_cost = FraudCost(amount_col="value")

- probabilities = pd.Series([.1, .5, .5])
y_predicted = pd.Series([.1, .5, .5])
y_true = [True, False, True]
extra_columns = pd.DataFrame({"value": [100, 5, 25]})

- out = fraud_cost.decision_function(probabilities, extra_columns, 5)
- assert out.tolist() == [True, False, True]
out = fraud_cost.decision_function(y_predicted, extra_columns, 5)
assert out.tolist() == y_true
score = fraud_cost.score(out, y_true, extra_columns)
assert (score == 0.0)

# testing with other types of inputs
y_predicted = np.array([.1, .5, .5])
extra_columns = {"value": [100, 5, 25]}
out = fraud_cost.decision_function(y_predicted, extra_columns, 5)
assert out.tolist() == y_true
score = fraud_cost.score(out, y_true, extra_columns)
assert (score == 0.0)
16 changes: 13 additions & 3 deletions evalml/tests/objective_tests/test_lead_scoring.py
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd

from evalml import AutoClassifier
@@ -13,11 +14,20 @@ def test_function(X_y):
)

clf = AutoClassifier(objective=objective, max_pipelines=1, random_state=0)

- X = pd.DataFrame(X)
clf.fit(X, y)

pipeline = clf.best_pipeline
pipeline.predict(X)
pipeline.predict_proba(X)
pipeline.score(X, y)

predicted = pd.Series([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
y_true = [False, True, False, True]
assert out.tolist() == [False, True, False, True]

predicted = np.array([1, 10, .5, 5])
out = objective.decision_function(predicted, 1)
assert out.tolist() == y_true

score = objective.score(out, y_true)
assert (score == 0.5)
4 changes: 0 additions & 4 deletions evalml/tests/objective_tests/test_objectives.py
@@ -1,5 +1,3 @@
- import pandas as pd

from evalml.objectives import (
Precision,
PrecisionMacro,
@@ -24,8 +22,6 @@ def test_get_objectives_types():

def test_binary_average(X_y):
X, y = X_y
- X = pd.DataFrame(X)
- y = pd.Series(y)

pipeline = LogisticRegressionPipeline(objective=Precision(), penalty='l2', C=1.0, impute_strategy='mean', number_features=0)
pipeline.fit(X, y)
3 changes: 0 additions & 3 deletions evalml/tests/pipeline_tests/test_pipelines.py
@@ -1,6 +1,5 @@
import os

- import pandas as pd
import pytest

from evalml.model_types import ModelTypes
@@ -72,8 +71,6 @@ def test_load_pickled_pipeline_with_custom_objective(X_y, pickled_pipeline_path)

def test_reproducibility(X_y):
X, y = X_y
- X = pd.DataFrame(X)

objective = FraudCost(
retry_percentage=.5,
interchange_fee=.02,
6 changes: 0 additions & 6 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -1,12 +1,8 @@
- import pandas as pd

from evalml.preprocessing import split_data


def test_split_regression(X_y_reg):
X, y = X_y_reg
- X = pd.DataFrame(X)
- y = pd.Series(y)
test_pct = 0.25
X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct, regression=True)
test_size = len(X) * test_pct
@@ -19,8 +15,6 @@ def test_split_regression(X_y_reg):

def test_split_classification(X_y):
X, y = X_y
- X = pd.DataFrame(X)
- y = pd.Series(y)
test_pct = 0.25
X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_pct)
test_size = len(X) * 0.25