MNT Warn when shuffle is False but random_state is not None (scikit-learn#15353)

NicolasHug · web-flow · commit 7effc37174ff · 2019-10-24T16:33:12.000-04:00
diff --git a/doc/whats_new/v0.22.rst b/doc/whats_new/v0.22.rst
@@ -484,6 +484,11 @@ Changelog
   where one test set could be `n_classes` larger than another. Test sets should
   now be near-equally sized. :pr:`14704` by `Joel Nothman`_.
 
+- |API| :class:`model_selection.KFold` and
+  :class:`model_selection.StratifiedKFold` now raise a warning if
+  `random_state` is set but `shuffle` is False. This combination will raise an
+  error in 0.24. :pr:`15353` by `Nicolas Hug`_.
+
 :mod:`sklearn.multioutput`
 ..........................
 
diff --git a/sklearn/linear_model/tests/test_logistic.py b/sklearn/linear_model/tests/test_logistic.py
@@ -1496,7 +1496,7 @@ def test_LogisticRegressionCV_GridSearchCV_elastic_net(multi_class):
         X, y = make_classification(n_samples=100, n_classes=3, n_informative=3,
                                    random_state=0)
 
-    cv = StratifiedKFold(5, random_state=0)
+    cv = StratifiedKFold(5)
 
     l1_ratios = np.linspace(0, 1, 3)
     Cs = np.logspace(-4, 4, 3)
@@ -1527,7 +1527,7 @@ def test_LogisticRegressionCV_GridSearchCV_elastic_net_ovr():
     X, y = make_classification(n_samples=100, n_classes=3, n_informative=3,
                                random_state=0)
     X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
-    cv = StratifiedKFold(5, random_state=0)
+    cv = StratifiedKFold(5)
 
     l1_ratios = np.linspace(0, 1, 3)
     Cs = np.logspace(-4, 4, 3)
@@ -1770,7 +1770,7 @@ def test_scores_attribute_layout_elasticnet():
     # the third dimension corresponds to l1_ratios.
 
     X, y = make_classification(n_samples=1000, random_state=0)
-    cv = StratifiedKFold(n_splits=5, shuffle=False)
+    cv = StratifiedKFold(n_splits=5)
 
     l1_ratios = [.1, .9]
     Cs = [.1, 1, 10]
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -287,6 +287,15 @@ def __init__(self, n_splits, shuffle, random_state):
             raise TypeError("shuffle must be True or False;"
                             " got {0}".format(shuffle))
 
+        if not shuffle and random_state is not None:  # None is the default
+            # TODO 0.24: raise a ValueError instead of a warning
+            warnings.warn(
+                'Setting a random_state has no effect since shuffle is '
+                'False. This will raise an error in 0.24. You should leave '
+                'random_state to its default (None), or set shuffle=True.',
+                DeprecationWarning
+            )
+
         self.n_splits = n_splits
         self.shuffle = shuffle
         self.random_state = random_state
@@ -374,7 +383,8 @@ class KFold(_BaseKFold):
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
-        by `np.random`. Used when ``shuffle`` == True.
+        by `np.random`. Only used when ``shuffle`` is True. This should be left
+        as None if ``shuffle`` is False.
 
     Examples
     --------
@@ -579,7 +589,8 @@ class StratifiedKFold(_BaseKFold):
         If int, random_state is the seed used by the random number generator;
         If RandomState instance, random_state is the random number generator;
         If None, the random number generator is the RandomState instance used
-        by `np.random`. Used when ``shuffle`` == True.
+        by `np.random`. Only used when ``shuffle`` is True. This should be left
+        as None if ``shuffle`` is False.
 
     Examples
     --------
diff --git a/sklearn/model_selection/tests/test_search.py b/sklearn/model_selection/tests/test_search.py
@@ -1220,7 +1220,7 @@ def test_search_cv_results_none_param():
     X, y = [[1], [2], [3], [4], [5]], [0, 0, 0, 0, 1]
     estimators = (DecisionTreeRegressor(), DecisionTreeClassifier())
     est_parameters = {"random_state": [0, None]}
-    cv = KFold(random_state=0)
+    cv = KFold()
 
     for est in estimators:
         grid_search = GridSearchCV(est, est_parameters, cv=cv,
@@ -1294,7 +1294,7 @@ def test_grid_search_correct_score_results():
 
 def test_fit_grid_point():
     X, y = make_classification(random_state=0)
-    cv = StratifiedKFold(random_state=0)
+    cv = StratifiedKFold()
     svc = LinearSVC(random_state=0)
     scorer = make_scorer(accuracy_score)
 
@@ -1345,7 +1345,7 @@ def test_grid_search_with_multioutput_data():
                                           random_state=0)
 
     est_parameters = {"max_depth": [1, 2, 3, 4]}
-    cv = KFold(random_state=0)
+    cv = KFold()
 
     estimators = [DecisionTreeRegressor(random_state=0),
                   DecisionTreeClassifier(random_state=0)]
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
@@ -390,7 +390,8 @@ def test_stratified_kfold_ratios(k, shuffle):
     distr = np.bincount(y) / len(y)
 
     test_sizes = []
-    skf = StratifiedKFold(k, random_state=0, shuffle=shuffle)
+    random_state = None if not shuffle else 0
+    skf = StratifiedKFold(k, random_state=random_state, shuffle=shuffle)
     for train, test in skf.split(X, y):
         assert_allclose(np.bincount(y[train]) / len(train), distr, atol=0.02)
         assert_allclose(np.bincount(y[test]) / len(test), distr, atol=0.02)
@@ -409,9 +410,10 @@ def test_stratified_kfold_label_invariance(k, shuffle):
     X = np.ones(len(y))
 
     def get_splits(y):
+        random_state = None if not shuffle else 0
         return [(list(train), list(test))
                 for train, test
-                in StratifiedKFold(k, random_state=0,
+                in StratifiedKFold(k, random_state=random_state,
                                    shuffle=shuffle).split(X, y)]
 
     splits_base = get_splits(y)
@@ -1582,3 +1584,12 @@ def test_leave_p_out_empty_trainset():
             ValueError,
             match='p=2 must be strictly less than the number of samples=2'):
         next(cv.split(X, y, groups=[1, 2]))
+
+
+@pytest.mark.parametrize('Klass', (KFold, StratifiedKFold))
+def test_random_state_shuffle_false(Klass):
+    # passing a non-default random_state when shuffle=False makes no sense
+    # TODO 0.24: raise a ValueError instead of a warning
+    with pytest.warns(DeprecationWarning,
+                      match='has no effect since shuffle is False'):
+        Klass(3, shuffle=False, random_state=0)