automl · mfeurer · Jul 28, 2020 · Jul 3, 2020 · Jul 8, 2020 · Jul 8, 2020
diff --git a/autosklearn/automl.py b/autosklearn/automl.py
@@ -3,13 +3,14 @@
 import json
 import multiprocessing
 import os
-from typing import Optional, List
+from typing import Optional, List, Union
 import unittest.mock
 import warnings
 
 from ConfigSpace.read_and_write import pcs
 import numpy as np
 import numpy.ma as ma
+import pandas as pd
 import scipy.stats
 from sklearn.base import BaseEstimator
 from sklearn.model_selection._split import _RepeatedSplits, \
@@ -25,6 +26,7 @@
 
 from autosklearn.metrics import Scorer
 from autosklearn.data.xy_data_manager import XYDataManager
+from autosklearn.data.validation import InputValidator
 from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
 from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
 from autosklearn.evaluation.train_evaluator import _fit_with_budget
@@ -189,6 +191,8 @@ def __init__(self,
 
         self._debug_mode = debug_mode
 
+        self.InputValidator = InputValidator()
+
         # Place holder for the run history of the
         # Ensemble building process
         self.ensemble_performance_history = []
@@ -286,6 +290,17 @@ def fit(
         only_return_configuration_space: Optional[bool] = False,
         load_models: bool = True,
     ):
+        # Make sure that input is valid
+        # Performs ordinal one-hot encoding of the target,
+        # for both the train and the test data
+        X, y = self.InputValidator.validate(X, y)
+
+        if X_test is not None:
+            X_test, y_test = self.InputValidator.validate(X_test, y_test)
+            if len(y.shape) != len(y_test.shape):
+                raise ValueError('Target value shapes do not match: %s vs %s'
+                                 % (y.shape, y_test.shape))
+
         # Reset learnt stuff
         self.models_ = None
         self.cv_models_ = None
@@ -331,6 +346,10 @@ def fit(
                     raise ValueError('Only `Categorical` and `Numerical` are '
                                      'valid feature types, you passed `%s`' % ft)
 
+        # Feature types dynamically understood from dataframe
+        if feat_type is None and self.InputValidator.feature_types:
+            feat_type = self.InputValidator.feature_types
+
         datamanager = XYDataManager(
             X, y,
             X_test=X_test,
@@ -530,6 +549,9 @@ def fit(
 
     def refit(self, X, y):
 
+        # Make sure input data is valid
+        X, y = self.InputValidator.validate(X, y)
+
         if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
             self._load_models()
 
@@ -609,6 +631,9 @@ def predict(self, X, batch_size=None, n_jobs=1):
             raise ValueError("Predict and predict_proba can only be called "
                              "if 'ensemble_size != 0'")
 
+        # Make sure that input is valid
+        X = self.InputValidator.validate_features(X)
+
         # Parallelize predictions across models with n_jobs processes.
         # Each process computes predictions in chunks of batch_size rows.
         try:
@@ -659,6 +684,9 @@ def fit_ensemble(self, y, task=None, precision=32,
         if self._logger is None:
             self._logger = self._get_logger(dataset_name)
 
+        # Make sure that input is valid
+        y = self.InputValidator.validate_target(y, is_classification=True)
+
         self._proc_ensemble = self._get_ensemble_process(
             1, task, precision, dataset_name, max_iterations=1,
             ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
@@ -793,7 +821,20 @@ def _load_best_individual_model(self):
     def score(self, X, y):
         # fix: Consider only index 1 of second dimension
         # Don't know if the reshaping should be done there or in calculate_score
+
+        # Make sure that input is valid
+        X, y = self.InputValidator.validate(X, y)
+
         prediction = self.predict(X)
+
+        # Encode the prediction using the input validator
+        # We train autosklearn with an encoded version of y,
+        # which is decoded by predict().
+        # Above call to validate() encodes the y given for score()
+        # Below call encodes the prediction, so we compare in the
+        # same representation domain
+        prediction = self.InputValidator.encode_target(prediction)
+
         return calculate_score(solution=y,
                                prediction=prediction,
                                task_type=self._task,
@@ -971,67 +1012,7 @@ def configuration_space_created_hook(self, datamanager, configuration_space):
         return configuration_space
 
 
-class BaseAutoML(AutoML):
-    """Base class for AutoML objects to hold abstract functions for both
-    regression and classification."""
-
-    def __init__(self, *args, **kwargs):
-        self._n_outputs = 1
-        super().__init__(*args, **kwargs)
-
-    def _perform_input_checks(self, X, y):
-        X = self._check_X(X)
-        if y is not None:
-            y = self._check_y(y)
-        return X, y
-
-    def _check_X(self, X):
-        X = sklearn.utils.check_array(X, accept_sparse="csr",
-                                      force_all_finite=False)
-        if scipy.sparse.issparse(X):
-            X.sort_indices()
-        return X
-
-    def _check_y(self, y):
-        y = sklearn.utils.check_array(y, ensure_2d=False)
-        y = np.atleast_1d(y)
-
-        if y.ndim == 1:
-            return y
-        elif y.ndim == 2 and y.shape[1] == 1:
-            warnings.warn("A column-vector y was passed when a 1d array was"
-                          " expected. Will change shape via np.ravel().",
-                          sklearn.utils.DataConversionWarning, stacklevel=2)
-            y = np.ravel(y)
-            return y
-
-        return y
-
-    def refit(self, X, y):
-        X, y = self._perform_input_checks(X, y)
-        _n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
-        if self._n_outputs != _n_outputs:
-            raise ValueError('Number of outputs changed from %d to %d!' %
-                             (self._n_outputs, _n_outputs))
-
-        return super().refit(X, y)
-
-    def fit_ensemble(self, y, task=None, precision=32,
-                     dataset_name=None, ensemble_nbest=None,
-                     ensemble_size=None):
-        _n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
-        if self._n_outputs != _n_outputs:
-            raise ValueError('Number of outputs changed from %d to %d!' %
-                             (self._n_outputs, _n_outputs))
-
-        return super().fit_ensemble(
-            y, task=task, precision=precision,
-            dataset_name=dataset_name, ensemble_nbest=ensemble_nbest,
-            ensemble_size=ensemble_size
-        )
-
-
-class AutoMLClassifier(BaseAutoML):
+class AutoMLClassifier(AutoML):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
@@ -1041,23 +1022,21 @@ def __init__(self, *args, **kwargs):
 
     def fit(
         self,
-        X: np.ndarray,
-        y: np.ndarray,
-        X_test: Optional[np.ndarray] = None,
-        y_test: Optional[np.ndarray] = None,
+        X: Union[np.ndarray, pd.DataFrame],
+        y: Union[np.ndarray, pd.DataFrame],
+        X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+        y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
         feat_type: Optional[List[bool]] = None,
         dataset_name: Optional[str] = None,
         only_return_configuration_space: bool = False,
         load_models: bool = True,
     ):
-        X, y = self._perform_input_checks(X, y)
-        if X_test is not None:
-            X_test, y_test = self._perform_input_checks(X_test, y_test)
-            if len(y.shape) != len(y_test.shape):
-                raise ValueError('Target value shapes do not match: %s vs %s'
-                                 % (y.shape, y_test.shape))
 
-        y_task = type_of_target(y)
+        # We first validate the dtype of the target provided by the user
+        # In doing so, we also fit the internal encoder for classification
+        y_task = type_of_target(
+            self.InputValidator.validate_target(y, is_classification=True)
+        )
         task = self._task_mapping.get(y_task)
         if task is None:
             raise ValueError('Cannot work on data of type %s' % y_task)
@@ -1068,22 +1047,6 @@ def fit(
             else:
                 self._metric = accuracy
 
-        y, self._classes, self._n_classes = self._process_target_classes(y)
-        if y_test is not None:
-            # Map test values to actual values - TODO: copy to all kinds of
-            # other parts in this code and test it!!!
-            y_test_new = []
-            for output_idx in range(len(self._classes)):
-                mapping = {self._classes[output_idx][idx]: idx
-                           for idx in range(len(self._classes[output_idx]))}
-                enumeration = y_test if len(self._classes) == 1 else y_test[output_idx]
-                y_test_new.append(
-                    np.array([mapping[value] for value in enumeration])
-                )
-            y_test = np.array(y_test_new)
-            if self._n_outputs == 1:
-                y_test = y_test.flatten()
-
         return super().fit(
             X, y,
             X_test=X_test,
@@ -1095,67 +1058,22 @@ def fit(
             load_models=load_models,
         )
 
-    def fit_ensemble(self, y, task=None, precision=32,
-                     dataset_name=None, ensemble_nbest=None,
-                     ensemble_size=None):
-        y, _classes, _n_classes = self._process_target_classes(y)
-        if not hasattr(self, '_classes'):
-            self._classes = _classes
-        if not hasattr(self, '_n_classes'):
-            self._n_classes = _n_classes
-
-        return super().fit_ensemble(y, task, precision, dataset_name,
-                                    ensemble_nbest, ensemble_size)
-
-    def _process_target_classes(self, y):
-        y = super()._check_y(y)
-        self._n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
-
-        y = np.copy(y)
-
-        _classes = []
-        _n_classes = []
-
-        if self._n_outputs == 1:
-            classes_k, y = np.unique(y, return_inverse=True)
-            _classes.append(classes_k)
-            _n_classes.append(classes_k.shape[0])
-        else:
-            for k in range(self._n_outputs):
-                classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
-                _classes.append(classes_k)
-                _n_classes.append(classes_k.shape[0])
-
-        _n_classes = np.array(_n_classes, dtype=np.int)
-
-        return y, _classes, _n_classes
-
     def predict(self, X, batch_size=None, n_jobs=1):
         predicted_probabilities = super().predict(X, batch_size=batch_size,
                                                   n_jobs=n_jobs)
 
-        if self._n_outputs == 1:
+        if self.InputValidator.is_single_column_target() == 1:
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
-            predicted_classes = self._classes[0].take(predicted_indexes)
-
-            return predicted_classes
         else:
-            predicted_indices = (predicted_probabilities > 0.5).astype(int)
-            n_samples = predicted_probabilities.shape[0]
-            predicted_classes = np.zeros((n_samples, self._n_outputs))
+            predicted_indexes = (predicted_probabilities > 0.5).astype(int)
 
-            for k in range(self._n_outputs):
-                output_predicted_indexes = predicted_indices[:, k].reshape(-1)
-                predicted_classes[:, k] = self._classes[k].take(
-                    output_predicted_indexes)
-
-            return predicted_classes
+        return self.InputValidator.decode_target(predicted_indexes)
 
     def predict_proba(self, X, batch_size=None, n_jobs=1):
         return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)
 
 
-class AutoMLRegressor(BaseAutoML):
+class AutoMLRegressor(AutoML):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self._task_mapping = {'continuous-multioutput': MULTIOUTPUT_REGRESSION,
@@ -1164,25 +1082,28 @@ def __init__(self, *args, **kwargs):
 
     def fit(
         self,
-        X: np.ndarray,
-        y: np.ndarray,
-        X_test: Optional[np.ndarray] = None,
-        y_test: Optional[np.ndarray] = None,
+        X: Union[np.ndarray, pd.DataFrame],
+        y: Union[np.ndarray, pd.DataFrame],
+        X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
+        y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
         feat_type: Optional[List[bool]] = None,
         dataset_name: Optional[str] = None,
         only_return_configuration_space: bool = False,
         load_models: bool = True,
     ):
-        X, y = super()._perform_input_checks(X, y)
-        y_task = type_of_target(y)
+
+        # Check the data provided in y
+        # After the y data type is validated,
+        # check the task type
+        y_task = type_of_target(
+            self.InputValidator.validate_target(y)
+        )
         task = self._task_mapping.get(y_task)
         if task is None:
             raise ValueError('Cannot work on data of type %s' % y_task)
-
         if self._metric is None:
             self._metric = r2
 
-        self._n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
         return super().fit(
             X, y,
             X_test=X_test,
@@ -1193,10 +1114,3 @@ def fit(
             only_return_configuration_space=only_return_configuration_space,
             load_models=load_models,
         )
-
-    def fit_ensemble(self, y, task=None, precision=32,
-                     dataset_name=None, ensemble_nbest=None,
-                     ensemble_size=None):
-        y = super()._check_y(y)
-        return super().fit_ensemble(y, task, precision, dataset_name,
-                                    ensemble_nbest, ensemble_size)