Skip to content

Commit 385ba54

Browse files
Default n_jobs to 1 for StackedEnsemblers until we can write our own Ensembler (#2295)
* Default n_jobs to None for StackedEnsemblers * Release notes * release notes * test update * test update * test update * test update * don't mock fit and test n_jobs * update test * update release notes * test updates * add multiclass coverage to test * cleaner logic * improve test parametrization * lint fixes * change estimators for core dependencies true
1 parent f5ce75b commit 385ba54

File tree

5 files changed

+65
-7
lines changed

5 files changed

+65
-7
lines changed

docs/source/release_notes.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ Release Notes
66
* Added dictionary input functionality for ``Undersampler`` component :pr:`2271`
77
* Changed the default parameter values for ``Elastic Net Classifier`` and ``Elastic Net Regressor`` :pr:`2269`
88
* Fixes
9+
* Set default ``n_jobs`` to 1 for ``StackedEnsembleClassifier`` and ``StackedEnsembleRegressor`` until a fix for text-based parallelism in sklearn stacking can be found :pr:`2295`
910
* Changes
1011
* Updated ``start_iteration_callback`` to accept a pipeline instance instead of a pipeline class and no longer accept pipeline parameters as a parameter :pr:`2290`
1112
* Documentation Changes

evalml/automl/automl_algorithm/iterative_algorithm.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def __init__(self,
3333
n_jobs=-1, # TODO remove
3434
number_features=None, # TODO remove
3535
ensembling=False,
36+
text_in_ensembling=False,
3637
pipeline_params=None,
3738
_frozen_pipeline_parameters=None,
3839
_estimator_family_order=None):
@@ -47,6 +48,7 @@ def __init__(self,
4748
n_jobs (int or None): Integer describing level of parallelism used for pipelines. Defaults to -1.
4849
number_features (int): The number of columns in the input features.
4950
ensembling (boolean): If True, runs ensembling in a separate batch after every allowed pipeline class has been iterated over. Defaults to False.
51+
text_in_ensembling (boolean): If True and ensembling is True, then n_jobs for the stacked ensemble pipeline will be set to 1 (instead of the algorithm's n_jobs) to avoid downstream sklearn stacking issues related to nltk. Defaults to False.
5052
pipeline_params (dict or None): Pipeline-level parameters that should be passed to the proposed pipelines.
5153
_frozen_pipeline_parameters (dict or None): Pipeline-level parameters are frozen and used in the proposed pipelines.
5254
_estimator_family_order (list(ModelFamily) or None): specify the sort order for the first batch. Defaults to _ESTIMATOR_FAMILY_ORDER.
@@ -75,6 +77,7 @@ def __init__(self,
7577
self._first_batch_results = []
7678
self._best_pipeline_info = {}
7779
self.ensembling = ensembling and len(self.allowed_pipelines) > 1
80+
self.text_in_ensembling = text_in_ensembling
7881
self._pipeline_params = pipeline_params or {}
7982
self._frozen_pipeline_parameters = _frozen_pipeline_parameters or {}
8083

@@ -105,9 +108,10 @@ def next_batch(self):
105108
parameters = self._combine_parameters(pipeline, pipeline_params)
106109
input_pipelines.append(pipeline.new(parameters=parameters,
107110
random_seed=self.random_seed))
111+
n_jobs_ensemble = 1 if self.text_in_ensembling else self.n_jobs
108112
ensemble = _make_stacked_ensemble_pipeline(input_pipelines, input_pipelines[0].problem_type,
109113
random_seed=self.random_seed,
110-
n_jobs=self.n_jobs)
114+
n_jobs=n_jobs_ensemble)
111115

112116
next_batch.append(ensemble)
113117
else:

evalml/automl/automl_search.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,7 @@ def __init__(self,
385385
check_all_pipeline_names_unique(self.allowed_pipelines)
386386

387387
run_ensembling = self.ensembling
388+
text_in_ensembling = len(infer_feature_types(X_train).select('natural_language').columns) > 0
388389
if run_ensembling and len(self.allowed_pipelines) == 1:
389390
logger.warning("Ensembling is set to True, but the number of unique pipelines is one, so ensembling will not run.")
390391
run_ensembling = False
@@ -438,6 +439,7 @@ def __init__(self,
438439
number_features=self.X_train.shape[1],
439440
pipelines_per_batch=self._pipelines_per_batch,
440441
ensembling=run_ensembling,
442+
text_in_ensembling=text_in_ensembling,
441443
pipeline_params=parameters,
442444
_frozen_pipeline_parameters=self._frozen_pipeline_parameters
443445
)

evalml/tests/automl_tests/test_automl.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1899,6 +1899,49 @@ def test_search_with_text(mock_fit, mock_score):
18991899
assert automl.rankings['pipeline_name'][1:].str.contains('Text').all()
19001900

19011901

1902+
@pytest.mark.parametrize("problem_type,pipeline_name,ensemble_name",
1903+
[('binary', 'Stacked Ensemble Classification Pipeline', 'Stacked Ensemble Classifier'),
1904+
('multiclass', 'Stacked Ensemble Classification Pipeline', 'Stacked Ensemble Classifier'),
1905+
('regression', 'Stacked Ensemble Regression Pipeline', 'Stacked Ensemble Regressor')])
1906+
@pytest.mark.parametrize("df_text", [True, False])
1907+
@patch('evalml.automl.automl_algorithm.IterativeAlgorithm.__init__')
1908+
def test_search_with_text_and_ensembling(mock_iter, df_text, problem_type, pipeline_name, ensemble_name):
1909+
X_with_text = pd.DataFrame(
1910+
{'col_1': ['I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
1911+
'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
1912+
'I\'m gonna be the main event, like no king was before! I\'m brushing up on looking down, I\'m working on my ROAR!',
1913+
'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
1914+
'In sleep he sang to me, in dreams he came... That voice which calls to me, and speaks my name.',
1915+
'I\'m singing in the rain! Just singing in the rain, what a glorious feeling, I\'m happy again!',
1916+
'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
1917+
'I dreamed a dream in days gone by, when hope was high and life worth living',
1918+
'Red, the blood of angry men - black, the dark of ages past',
1919+
'do you hear the people sing? Singing the songs of angry men\n\tIt is the music of a people who will NOT be slaves again!',
1920+
'Red, the blood of angry men - black, the dark of ages past',
1921+
'It was red and yellow and green and brown and scarlet and black and ochre and peach and ruby and olive and violet and fawn...']
1922+
})
1923+
X_no_text = pd.DataFrame({'col_1': [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3]})
1924+
1925+
if df_text:
1926+
X = X_with_text
1927+
else:
1928+
X = X_no_text
1929+
if problem_type == 'binary':
1930+
y = [0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0]
1931+
elif problem_type == 'multiclass':
1932+
y = [0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
1933+
else:
1934+
y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
1935+
mock_iter.return_value = None
1936+
_ = AutoMLSearch(X_train=X, y_train=y, problem_type=problem_type, allowed_model_families=["random_forest", "decision_tree"],
1937+
max_batches=4, ensembling=True)
1938+
call_args = mock_iter.call_args_list[0][1]
1939+
if df_text:
1940+
assert call_args['text_in_ensembling']
1941+
else:
1942+
assert not call_args['text_in_ensembling']
1943+
1944+
19021945
@patch('evalml.pipelines.BinaryClassificationPipeline.score', return_value={"Log Loss Binary": 0.8})
19031946
@patch('evalml.pipelines.BinaryClassificationPipeline.fit')
19041947
def test_pipelines_per_batch(mock_fit, mock_score, X_y_binary):

evalml/tests/automl_tests/test_iterative_algorithm.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -236,10 +236,11 @@ def test_iterative_algorithm_one_allowed_pipeline(ensembling_value, logistic_reg
236236
assert any([p != logistic_regression_binary_pipeline_class({}).default_parameters for p in all_parameters])
237237

238238

239+
@pytest.mark.parametrize("text_in_ensembling", [True, False])
239240
@pytest.mark.parametrize("n_jobs", [-1, 0, 1, 2, 3])
240-
def test_iterative_algorithm_stacked_ensemble_n_jobs_binary(n_jobs, dummy_binary_pipeline_classes):
241+
def test_iterative_algorithm_stacked_ensemble_n_jobs_binary(n_jobs, text_in_ensembling, dummy_binary_pipeline_classes):
241242
dummy_binary_pipeline_classes = dummy_binary_pipeline_classes()
242-
algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, ensembling=True, n_jobs=n_jobs)
243+
algo = IterativeAlgorithm(allowed_pipelines=dummy_binary_pipeline_classes, ensembling=True, text_in_ensembling=text_in_ensembling, n_jobs=n_jobs)
243244
next_batch = algo.next_batch()
244245
seen_ensemble = False
245246
scores = range(0, len(next_batch))
@@ -251,13 +252,17 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_binary(n_jobs, dummy_binary
251252
for pipeline in next_batch:
252253
if isinstance(pipeline.estimator, StackedEnsembleClassifier):
253254
seen_ensemble = True
254-
assert pipeline.parameters['Stacked Ensemble Classifier']['n_jobs'] == n_jobs
255+
if text_in_ensembling:
256+
assert pipeline.parameters['Stacked Ensemble Classifier']['n_jobs'] == 1
257+
else:
258+
assert pipeline.parameters['Stacked Ensemble Classifier']['n_jobs'] == n_jobs
255259
assert seen_ensemble
256260

257261

262+
@pytest.mark.parametrize("text_in_ensembling", [True, False])
258263
@pytest.mark.parametrize("n_jobs", [-1, 0, 1, 2, 3])
259-
def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, linear_regression_pipeline_class):
260-
algo = IterativeAlgorithm(allowed_pipelines=[linear_regression_pipeline_class({}), linear_regression_pipeline_class({})], ensembling=True, n_jobs=n_jobs)
264+
def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, text_in_ensembling, linear_regression_pipeline_class):
265+
algo = IterativeAlgorithm(allowed_pipelines=[linear_regression_pipeline_class({}), linear_regression_pipeline_class({})], ensembling=True, text_in_ensembling=text_in_ensembling, n_jobs=n_jobs)
261266
next_batch = algo.next_batch()
262267
seen_ensemble = False
263268
scores = range(0, len(next_batch))
@@ -269,7 +274,10 @@ def test_iterative_algorithm_stacked_ensemble_n_jobs_regression(n_jobs, linear_r
269274
for pipeline in next_batch:
270275
if isinstance(pipeline.estimator, StackedEnsembleRegressor):
271276
seen_ensemble = True
272-
assert pipeline.parameters['Stacked Ensemble Regressor']['n_jobs'] == n_jobs
277+
if text_in_ensembling:
278+
assert pipeline.parameters['Stacked Ensemble Regressor']['n_jobs'] == 1
279+
else:
280+
assert pipeline.parameters['Stacked Ensemble Regressor']['n_jobs'] == n_jobs
273281
assert seen_ensemble
274282

275283

0 commit comments

Comments
 (0)