FIX: Add allow_nans option to check_arrays

YS-L · larsmans · commit 3f9dff942d0b · 2014-04-13T16:59:52.000+02:00
Grid search and cross validation should not panic when seeing NaNs in the input arrays, because that breaks Imputer. Fixes scikit-learn#2774 and scikit-learn#3044.
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -182,6 +182,11 @@ Changelog
      :class:`cluster.WardAgglomeration` when no samples are given,
      rather than returning meaningless clustering.
 
+   - Grid search and cross validation allow NaNs in the input arrays so that
+     preprocessors such as :class:`preprocessing.Imputer` can be trained
+     within the cross validation loop, avoiding the potentially skewed
+     results of imputing on the full data set beforehand.
+
 
 API changes summary
 -------------------
diff --git a/sklearn/cross_validation.py b/sklearn/cross_validation.py
@@ -1097,7 +1097,8 @@ def cross_val_score(estimator, X, y=None, scoring=None, cv=None, n_jobs=1,
     scores : array of float, shape=(len(list(cv)),)
         Array of scores of the estimator for each run of the cross validation.
     """
-    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True)
+    X, y = check_arrays(X, y, sparse_format='csr', allow_lists=True,
+                        allow_nans=True)
     if y is not None:
         y = np.asarray(y)
 
@@ -1408,7 +1409,7 @@ def permutation_test_score(estimator, X, y, score_func=None, cv=None,
         vol. 11
 
     """
-    X, y = check_arrays(X, y, sparse_format='csr')
+    X, y = check_arrays(X, y, sparse_format='csr', allow_nans=True)
     cv = _check_cv(cv, X, y, classifier=is_classifier(estimator))
     scorer = check_scoring(estimator, scoring=scoring, score_func=score_func)
     random_state = check_random_state(random_state)
@@ -1505,6 +1506,7 @@ def train_test_split(*arrays, **options):
     train_size = options.pop('train_size', None)
     random_state = options.pop('random_state', None)
     options['sparse_format'] = 'csr'
+    options['allow_nans'] = True
 
     if test_size is None and train_size is None:
         test_size = 0.25
diff --git a/sklearn/grid_search.py b/sklearn/grid_search.py
@@ -349,7 +349,8 @@ def _fit(self, X, y, parameter_iterable):
                                      score_func=self.score_func)
 
         n_samples = _num_samples(X)
-        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr')
+        X, y = check_arrays(X, y, allow_lists=True, sparse_format='csr',
+                            allow_nans=True)
 
         if y is not None:
             if len(y) != n_samples:
diff --git a/sklearn/tests/test_cross_validation.py b/sklearn/tests/test_cross_validation.py
@@ -35,6 +35,9 @@
 from sklearn.linear_model import Ridge
 from sklearn.svm import SVC
 
+from sklearn.preprocessing import Imputer
+from sklearn.pipeline import Pipeline
+
 
 class MockListClassifier(BaseEstimator):
     """Dummy classifier to test the cross-validation.
@@ -852,3 +855,35 @@ def test_safe_split_with_precomputed_kernel():
     X_te, y_te = cval._safe_split(clf, X, y, te, tr)
     K_te, y_te2 = cval._safe_split(clfp, K, y, te, tr)
     assert_array_almost_equal(K_te, np.dot(X_te, X_tr.T))
+
+
+def test_cross_val_score_allow_nans():
+    # Check that cross_val_score accepts NaNs in X and runs the pipeline
+    X = np.arange(200, dtype=np.float64).reshape(10, -1)
+    X[2, :] = np.nan
+    y = np.repeat([0, 1], X.shape[0] // 2)  # repeat count must be an int
+    p = Pipeline([
+        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
+        ('classifier', MockClassifier()),
+    ])
+    cval.cross_val_score(p, X, y, cv=5)
+
+
+def test_train_test_split_allow_nans():
+    # Check that train_test_split accepts NaNs in X; it must simply not raise
+    X = np.arange(200, dtype=np.float64).reshape(10, -1)
+    X[2, :] = np.nan
+    y = np.repeat([0, 1], X.shape[0] // 2)  # repeat count must be an int
+    cval.train_test_split(X, y, test_size=0.2, random_state=42)
+
+
+def test_permutation_test_score_allow_nans():
+    # Check that permutation_test_score accepts NaNs in X and runs the pipeline
+    X = np.arange(200, dtype=np.float64).reshape(10, -1)
+    X[2, :] = np.nan
+    y = np.repeat([0, 1], X.shape[0] // 2)  # repeat count must be an int
+    p = Pipeline([
+        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
+        ('classifier', MockClassifier()),
+    ])
+    cval.permutation_test_score(p, X, y, cv=5)
diff --git a/sklearn/tests/test_grid_search.py b/sklearn/tests/test_grid_search.py
@@ -39,6 +39,8 @@
 from sklearn.metrics import make_scorer
 from sklearn.metrics import roc_auc_score
 from sklearn.cross_validation import KFold, StratifiedKFold
+from sklearn.preprocessing import Imputer
+from sklearn.pipeline import Pipeline
 
 
 # Neither of the following two estimators inherit from BaseEstimator,
@@ -654,3 +656,15 @@ def test_predict_proba_disabled():
     clf = SVC(probability=False)
     gs = GridSearchCV(clf, {}, cv=2).fit(X, y)
     assert_false(hasattr(gs, "predict_proba"))
+
+
+def test_grid_search_allows_nans():
+    """Check that GridSearchCV accepts NaNs in X for an Imputer pipeline."""
+    X = np.arange(20, dtype=np.float64).reshape(5, -1)
+    X[2, :] = np.nan
+    y = [0, 0, 1, 1, 1]
+    p = Pipeline([
+        ('imputer', Imputer(strategy='mean', missing_values='NaN')),
+        ('classifier', MockClassifier()),
+    ])
+    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -211,6 +211,9 @@ def check_arrays(*arrays, **options):
     allow_lists : bool
         Allow lists of arbitrary objects as input, just check their length.
         Disables
+
+    allow_nans : boolean, False by default
+        Allow NaNs in the arrays by skipping the finiteness check.
     """
     sparse_format = options.pop('sparse_format', None)
     if sparse_format not in (None, 'csr', 'csc', 'dense'):
@@ -219,6 +222,8 @@ def check_arrays(*arrays, **options):
     check_ccontiguous = options.pop('check_ccontiguous', False)
     dtype = options.pop('dtype', None)
     allow_lists = options.pop('allow_lists', False)
+    allow_nans = options.pop('allow_nans', False)
+
     if options:
         raise TypeError("Unexpected keyword arguments: %r" % options.keys())
 
@@ -254,13 +259,15 @@ def check_arrays(*arrays, **options):
                     array.data = np.ascontiguousarray(array.data, dtype=dtype)
                 else:
                     array.data = np.asarray(array.data, dtype=dtype)
-                _assert_all_finite(array.data)
+                if not allow_nans:
+                    _assert_all_finite(array.data)
             else:
                 if check_ccontiguous:
                     array = np.ascontiguousarray(array, dtype=dtype)
                 else:
                     array = np.asarray(array, dtype=dtype)
-                _assert_all_finite(array)
+                if not allow_nans:
+                    _assert_all_finite(array)
 
             if array.ndim >= 3:
                 raise ValueError("Found array with dim %d. Expected <= 2" %