Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
220 changes: 67 additions & 153 deletions autosklearn/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@
import json
import multiprocessing
import os
from typing import Optional, List
from typing import Optional, List, Union
import unittest.mock
import warnings

from ConfigSpace.read_and_write import pcs
import numpy as np
import numpy.ma as ma
import pandas as pd
import scipy.stats
from sklearn.base import BaseEstimator
from sklearn.model_selection._split import _RepeatedSplits, \
Expand All @@ -25,6 +26,7 @@

from autosklearn.metrics import Scorer
from autosklearn.data.xy_data_manager import XYDataManager
from autosklearn.data.validation import InputValidator
from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash
from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings
from autosklearn.evaluation.train_evaluator import _fit_with_budget
Expand Down Expand Up @@ -189,6 +191,8 @@ def __init__(self,

self._debug_mode = debug_mode

self.InputValidator = InputValidator()

# Place holder for the run history of the
# Ensemble building process
self.ensemble_performance_history = []
Expand Down Expand Up @@ -286,6 +290,17 @@ def fit(
only_return_configuration_space: Optional[bool] = False,
load_models: bool = True,
):
# Make sure that input is valid
# Performs Ordinal one hot encoding to the target
# both for train and test data
X, y = self.InputValidator.validate(X, y)

if X_test is not None:
X_test, y_test = self.InputValidator.validate(X_test, y_test)
if len(y.shape) != len(y_test.shape):
raise ValueError('Target value shapes do not match: %s vs %s'
% (y.shape, y_test.shape))

# Reset learnt stuff
self.models_ = None
self.cv_models_ = None
Expand Down Expand Up @@ -331,6 +346,10 @@ def fit(
raise ValueError('Only `Categorical` and `Numerical` are '
'valid feature types, you passed `%s`' % ft)

# Feature types dynamically understood from dataframe
if feat_type is None and self.InputValidator.feature_types:
feat_type = self.InputValidator.feature_types

datamanager = XYDataManager(
X, y,
X_test=X_test,
Expand Down Expand Up @@ -530,6 +549,9 @@ def fit(

def refit(self, X, y):

# Make sure input data is valid
X, y = self.InputValidator.validate(X, y)

if self.models_ is None or len(self.models_) == 0 or self.ensemble_ is None:
self._load_models()

Expand Down Expand Up @@ -609,6 +631,9 @@ def predict(self, X, batch_size=None, n_jobs=1):
raise ValueError("Predict and predict_proba can only be called "
"if 'ensemble_size != 0'")

# Make sure that input is valid
X = self.InputValidator.validate_features(X)

# Parallelize predictions across models with n_jobs processes.
# Each process computes predictions in chunks of batch_size rows.
try:
Expand Down Expand Up @@ -659,6 +684,9 @@ def fit_ensemble(self, y, task=None, precision=32,
if self._logger is None:
self._logger = self._get_logger(dataset_name)

# Make sure that input is valid
y = self.InputValidator.validate_target(y, is_classification=True)

self._proc_ensemble = self._get_ensemble_process(
1, task, precision, dataset_name, max_iterations=1,
ensemble_nbest=ensemble_nbest, ensemble_size=ensemble_size)
Expand Down Expand Up @@ -793,7 +821,20 @@ def _load_best_individual_model(self):
def score(self, X, y):
# fix: Consider only index 1 of second dimension
# Don't know if the reshaping should be done there or in calculate_score

# Make sure that input is valid
X, y = self.InputValidator.validate(X, y)

prediction = self.predict(X)

# Encode the prediction using the input validator
# We train autosklearn with a encoded version of y,
# which is decoded by predict().
# Above call to validate() encodes the y given for score()
# Below call encodes the prediction, so we compare in the
# same representation domain
prediction = self.InputValidator.encode_target(prediction)

return calculate_score(solution=y,
prediction=prediction,
task_type=self._task,
Expand Down Expand Up @@ -971,67 +1012,7 @@ def configuration_space_created_hook(self, datamanager, configuration_space):
return configuration_space


class BaseAutoML(AutoML):
"""Base class for AutoML objects to hold abstract functions for both
regression and classification."""

def __init__(self, *args, **kwargs):
self._n_outputs = 1
super().__init__(*args, **kwargs)

def _perform_input_checks(self, X, y):
X = self._check_X(X)
if y is not None:
y = self._check_y(y)
return X, y

def _check_X(self, X):
X = sklearn.utils.check_array(X, accept_sparse="csr",
force_all_finite=False)
if scipy.sparse.issparse(X):
X.sort_indices()
return X

def _check_y(self, y):
y = sklearn.utils.check_array(y, ensure_2d=False)
y = np.atleast_1d(y)

if y.ndim == 1:
return y
elif y.ndim == 2 and y.shape[1] == 1:
warnings.warn("A column-vector y was passed when a 1d array was"
" expected. Will change shape via np.ravel().",
sklearn.utils.DataConversionWarning, stacklevel=2)
y = np.ravel(y)
return y

return y

def refit(self, X, y):
X, y = self._perform_input_checks(X, y)
_n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
if self._n_outputs != _n_outputs:
raise ValueError('Number of outputs changed from %d to %d!' %
(self._n_outputs, _n_outputs))

return super().refit(X, y)

def fit_ensemble(self, y, task=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
_n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
if self._n_outputs != _n_outputs:
raise ValueError('Number of outputs changed from %d to %d!' %
(self._n_outputs, _n_outputs))

return super().fit_ensemble(
y, task=task, precision=precision,
dataset_name=dataset_name, ensemble_nbest=ensemble_nbest,
ensemble_size=ensemble_size
)


class AutoMLClassifier(BaseAutoML):
class AutoMLClassifier(AutoML):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)

Expand All @@ -1041,23 +1022,21 @@ def __init__(self, *args, **kwargs):

def fit(
self,
X: np.ndarray,
y: np.ndarray,
X_test: Optional[np.ndarray] = None,
y_test: Optional[np.ndarray] = None,
X: Union[np.ndarray, pd.DataFrame],
y: Union[np.ndarray, pd.DataFrame],
X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
feat_type: Optional[List[bool]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: bool = False,
load_models: bool = True,
):
X, y = self._perform_input_checks(X, y)
if X_test is not None:
X_test, y_test = self._perform_input_checks(X_test, y_test)
if len(y.shape) != len(y_test.shape):
raise ValueError('Target value shapes do not match: %s vs %s'
% (y.shape, y_test.shape))

y_task = type_of_target(y)
# We first validate the dtype of the target provided by the user
# In doing so, we also fit the internal encoder for classification
y_task = type_of_target(
self.InputValidator.validate_target(y, is_classification=True)
)
task = self._task_mapping.get(y_task)
if task is None:
raise ValueError('Cannot work on data of type %s' % y_task)
Expand All @@ -1068,22 +1047,6 @@ def fit(
else:
self._metric = accuracy

y, self._classes, self._n_classes = self._process_target_classes(y)
if y_test is not None:
# Map test values to actual values - TODO: copy to all kinds of
# other parts in this code and test it!!!
y_test_new = []
for output_idx in range(len(self._classes)):
mapping = {self._classes[output_idx][idx]: idx
for idx in range(len(self._classes[output_idx]))}
enumeration = y_test if len(self._classes) == 1 else y_test[output_idx]
y_test_new.append(
np.array([mapping[value] for value in enumeration])
)
y_test = np.array(y_test_new)
if self._n_outputs == 1:
y_test = y_test.flatten()

return super().fit(
X, y,
X_test=X_test,
Expand All @@ -1095,67 +1058,22 @@ def fit(
load_models=load_models,
)

def fit_ensemble(self, y, task=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
y, _classes, _n_classes = self._process_target_classes(y)
if not hasattr(self, '_classes'):
self._classes = _classes
if not hasattr(self, '_n_classes'):
self._n_classes = _n_classes

return super().fit_ensemble(y, task, precision, dataset_name,
ensemble_nbest, ensemble_size)

def _process_target_classes(self, y):
y = super()._check_y(y)
self._n_outputs = 1 if len(y.shape) == 1 else y.shape[1]

y = np.copy(y)

_classes = []
_n_classes = []

if self._n_outputs == 1:
classes_k, y = np.unique(y, return_inverse=True)
_classes.append(classes_k)
_n_classes.append(classes_k.shape[0])
else:
for k in range(self._n_outputs):
classes_k, y[:, k] = np.unique(y[:, k], return_inverse=True)
_classes.append(classes_k)
_n_classes.append(classes_k.shape[0])

_n_classes = np.array(_n_classes, dtype=np.int)

return y, _classes, _n_classes

def predict(self, X, batch_size=None, n_jobs=1):
predicted_probabilities = super().predict(X, batch_size=batch_size,
n_jobs=n_jobs)

if self._n_outputs == 1:
if self.InputValidator.is_single_column_target() == 1:
predicted_indexes = np.argmax(predicted_probabilities, axis=1)
predicted_classes = self._classes[0].take(predicted_indexes)

return predicted_classes
else:
predicted_indices = (predicted_probabilities > 0.5).astype(int)
n_samples = predicted_probabilities.shape[0]
predicted_classes = np.zeros((n_samples, self._n_outputs))
predicted_indexes = (predicted_probabilities > 0.5).astype(int)

for k in range(self._n_outputs):
output_predicted_indexes = predicted_indices[:, k].reshape(-1)
predicted_classes[:, k] = self._classes[k].take(
output_predicted_indexes)

return predicted_classes
return self.InputValidator.decode_target(predicted_indexes)

def predict_proba(self, X, batch_size=None, n_jobs=1):
return super().predict(X, batch_size=batch_size, n_jobs=n_jobs)


class AutoMLRegressor(BaseAutoML):
class AutoMLRegressor(AutoML):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._task_mapping = {'continuous-multioutput': MULTIOUTPUT_REGRESSION,
Expand All @@ -1164,25 +1082,28 @@ def __init__(self, *args, **kwargs):

def fit(
self,
X: np.ndarray,
y: np.ndarray,
X_test: Optional[np.ndarray] = None,
y_test: Optional[np.ndarray] = None,
X: Union[np.ndarray, pd.DataFrame],
y: Union[np.ndarray, pd.DataFrame],
X_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
y_test: Optional[Union[np.ndarray, pd.DataFrame]] = None,
feat_type: Optional[List[bool]] = None,
dataset_name: Optional[str] = None,
only_return_configuration_space: bool = False,
load_models: bool = True,
):
X, y = super()._perform_input_checks(X, y)
y_task = type_of_target(y)

# Check the data provided in y
# After the y data type is validated,
# check the task type
y_task = type_of_target(
self.InputValidator.validate_target(y)
)
task = self._task_mapping.get(y_task)
if task is None:
raise ValueError('Cannot work on data of type %s' % y_task)

if self._metric is None:
self._metric = r2

self._n_outputs = 1 if len(y.shape) == 1 else y.shape[1]
return super().fit(
X, y,
X_test=X_test,
Expand All @@ -1193,10 +1114,3 @@ def fit(
only_return_configuration_space=only_return_configuration_space,
load_models=load_models,
)

def fit_ensemble(self, y, task=None, precision=32,
dataset_name=None, ensemble_nbest=None,
ensemble_size=None):
y = super()._check_y(y)
return super().fit_ensemble(y, task, precision, dataset_name,
ensemble_nbest, ensemble_size)
Loading