Merge pull request scikit-learn#4541 from amueller/robust_input_dtype_check

GaelVaroquaux · GaelVaroquaux · commit 203298e08894 · 2015-04-14T17:45:22.000+02:00
[MRG + 1] FIX be robust to a column named "dtype", more robust dtype checking
diff --git a/sklearn/utils/tests/test_validation.py b/sklearn/utils/tests/test_validation.py
@@ -13,6 +13,7 @@
 from sklearn.utils.testing import assert_raises_regexp
 from sklearn.utils import as_float_array, check_array, check_symmetric
 from sklearn.utils import check_X_y
+from sklearn.utils.mocking import MockDataFrame
 from sklearn.utils.estimator_checks import NotAnArray
 from sklearn.random_projection import sparse_random_matrix
 from sklearn.linear_model import ARDRegression
@@ -218,6 +219,25 @@ def test_check_array():
     assert_true(isinstance(result, np.ndarray))
 
 
+def test_check_array_pandas_dtype_object_conversion():
+    # a data-frame-like wrapper around an object-dtype array must be
+    # converted to a float array by check_array
+    X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=object)
+    X_df = MockDataFrame(X)
+    assert_equal(check_array(X_df).dtype.kind, "f")
+    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
+    # smoke-test: "dtype" may be a column name rather than a numpy dtype
+    X_df.dtype = "Hans"
+    assert_equal(check_array(X_df, ensure_2d=False).dtype.kind, "f")
+
+
+def test_check_array_dtype_stability():
+    # nested lists of ints must keep an integer dtype (no float upcast)
+    int_rows = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
+    for kwargs in ({}, {"ensure_2d": False}):
+        assert_equal(check_array(int_rows, **kwargs).dtype.kind, "i")
+
+
 def test_check_array_min_samples_and_features_messages():
     # empty list is considered 2D by default:
     msg = "0 feature(s) (shape=(1, 0)) while a minimum of 1 is required."
diff --git a/sklearn/utils/validation.py b/sklearn/utils/validation.py
@@ -324,21 +324,27 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
     if isinstance(accept_sparse, str):
         accept_sparse = [accept_sparse]
 
+    # store whether originally we wanted numeric dtype
+    dtype_numeric = dtype == "numeric"
+
     if sp.issparse(array):
-        if dtype == "numeric":
+        if dtype_numeric:
             dtype = None
         array = _ensure_sparse_format(array, accept_sparse, dtype, order,
                                       copy, force_all_finite)
     else:
         if ensure_2d:
             array = np.atleast_2d(array)
-        if dtype == "numeric":
-            if hasattr(array, "dtype") and array.dtype.kind == "O":
+        if dtype_numeric:
+            if hasattr(array, "dtype") and getattr(array.dtype, "kind", None) == "O":
                 # if input is object, convert to float.
                 dtype = np.float64
             else:
                 dtype = None
         array = np.array(array, dtype=dtype, order=order, copy=copy)
+        # make sure we actually converted to numeric:
+        if dtype_numeric and array.dtype.kind == "O":
+            array = array.astype(np.float64)
         if not allow_nd and array.ndim >= 3:
             raise ValueError("Found array with dim %d. Expected <= 2" %
                              array.ndim)
@@ -353,7 +359,6 @@ def check_array(array, accept_sparse=None, dtype="numeric", order=None,
                              " minimum of %d is required."
                              % (n_samples, shape_repr, ensure_min_samples))
 
-
     if ensure_min_features > 0 and array.ndim == 2:
         n_features = array.shape[1]
         if n_features < ensure_min_features: