Merge pull request rasbt#1042 from rasbt/sklearn-deprecations

rasbt · web-flow · commit 7f11b9b4978a · 2023-05-19T11:15:33.000-04:00
Fix tests that used sklearn's deprecated load_boston: switch to load_iris or mlxtend.data.boston_housing_data
diff --git a/mlxtend/feature_selection/tests/test_column_selector.py b/mlxtend/feature_selection/tests/test_column_selector.py
@@ -11,7 +11,7 @@
 from packaging.version import Version
 from sklearn import __version__ as sklearn_version
 from sklearn import datasets
-from sklearn.linear_model import LinearRegression, LogisticRegression
+from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import GridSearchCV
 from sklearn.pipeline import make_pipeline
 
@@ -64,68 +64,69 @@ def test_ColumnSelector_in_gridsearch():
 
 
 def test_ColumnSelector_with_dataframe():
-    boston = datasets.load_boston()
-    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
-    df_out = ColumnSelector(cols=("ZN", "CRIM")).transform(df_in)
-    assert df_out.shape == (506, 2)
+    iris = datasets.load_iris()
+    df_in = pd.DataFrame(iris.data, columns=iris.feature_names)
+    df_out = ColumnSelector(cols=("sepal length (cm)", "sepal width (cm)")).transform(
+        df_in
+    )
+    assert df_out.shape == (150, 2)
 
 
 def test_ColumnSelector_with_dataframe_and_int_columns():
-    boston = datasets.load_boston()
-    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
-    df_out_str = ColumnSelector(cols=("INDUS", "CHAS")).transform(df_in)
+    iris = datasets.load_iris()
+    df_in = pd.DataFrame(iris.data, columns=iris.feature_names)
+    df_out_str = ColumnSelector(
+        cols=("petal length (cm)", "petal width (cm)")
+    ).transform(df_in)
     df_out_int = ColumnSelector(cols=(2, 3)).transform(df_in)
 
     np.testing.assert_array_equal(df_out_str[:, 0], df_out_int[:, 0])
     np.testing.assert_array_equal(df_out_str[:, 1], df_out_int[:, 1])
 
 
 def test_ColumnSelector_with_dataframe_drop_axis():
-    boston = datasets.load_boston()
-    df_in = pd.DataFrame(boston.data, columns=boston.feature_names)
-    X1_out = ColumnSelector(cols="ZN", drop_axis=True).transform(df_in)
-    assert X1_out.shape == (506,)
+    """Single-column selection by str or 1-tuple, with and without drop_axis."""
+    iris = datasets.load_iris()
+    df_in = pd.DataFrame(iris.data, columns=iris.feature_names)
+    # Bare-string column name (the 1-tuple form is the next case).
+    X1_out = ColumnSelector(cols="petal length (cm)", drop_axis=True).transform(df_in)
+    assert X1_out.shape == (150,)
 
-    X1_out = ColumnSelector(cols=("ZN",), drop_axis=True).transform(df_in)
-    assert X1_out.shape == (506,)
+    X1_out = ColumnSelector(cols=("petal length (cm)",), drop_axis=True).transform(
+        df_in
+    )
+    assert X1_out.shape == (150,)
 
-    X1_out = ColumnSelector(cols="ZN").transform(df_in)
-    assert X1_out.shape == (506, 1)
+    X1_out = ColumnSelector(cols="petal length (cm)").transform(df_in)
+    assert X1_out.shape == (150, 1)
 
-    X1_out = ColumnSelector(cols=("ZN",)).transform(df_in)
-    assert X1_out.shape == (506, 1)
+    X1_out = ColumnSelector(cols=("petal length (cm)",)).transform(df_in)
+    assert X1_out.shape == (150, 1)
 
 
 def test_ColumnSelector_with_dataframe_in_gridsearch():
-    boston = datasets.load_boston()
-    X = pd.DataFrame(boston.data, columns=boston.feature_names)
-    y = boston.target
-    pipe = make_pipeline(ColumnSelector(), LinearRegression())
+    iris = datasets.load_iris()
+    X = pd.DataFrame(iris.data, columns=iris.feature_names)
+    y = iris.target
+    pipe = make_pipeline(ColumnSelector(), LogisticRegression())
     grid = {
-        "columnselector__cols": [["ZN", "RM"], ["ZN", "RM", "AGE"], "ZN", ["RM"]],
-        "linearregression__copy_X": [True, False],
-        "linearregression__fit_intercept": [True, False],
+        "columnselector__cols": [
+            ["petal length (cm)", "petal width (cm)"],
+            ["sepal length (cm)", "sepal width (cm)", "petal width (cm)"],
+        ],
     }
 
-    if Version(sklearn_version) < Version("0.24.1"):
-        gsearch1 = GridSearchCV(
-            estimator=pipe,
-            param_grid=grid,
-            cv=5,
-            n_jobs=1,
-            iid=False,
-            scoring="neg_mean_squared_error",
-            refit=False,
-        )
-    else:
-        gsearch1 = GridSearchCV(
-            estimator=pipe,
-            param_grid=grid,
-            cv=5,
-            n_jobs=1,
-            scoring="neg_mean_squared_error",
-            refit=False,
-        )
+    gsearch1 = GridSearchCV(
+        estimator=pipe,
+        param_grid=grid,
+        cv=5,
+        n_jobs=1,
+        scoring="accuracy",
+        refit=False,
+    )
 
     gsearch1.fit(X, y)
-    assert gsearch1.best_params_["columnselector__cols"] == ["ZN", "RM", "AGE"]
+    assert gsearch1.best_params_["columnselector__cols"] == [
+        "petal length (cm)",
+        "petal width (cm)",
+    ]
diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector.py
@@ -9,7 +9,7 @@
 from numpy.testing import assert_almost_equal
 from packaging.version import Version
 from sklearn import __version__ as sklearn_version
-from sklearn.datasets import load_boston, load_iris
+from sklearn.datasets import load_iris
 from sklearn.decomposition import PCA
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
@@ -19,6 +19,7 @@
 from sklearn.pipeline import Pipeline
 
 from mlxtend.classifier import SoftmaxRegression
+from mlxtend.data import boston_housing_data
 from mlxtend.feature_selection import SequentialFeatureSelector as SFS
 from mlxtend.utils import assert_raises
 
@@ -421,8 +422,7 @@ def test_knn_scoring_metric():
 
 
 def test_regression():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
     sfs_r = SFS(
         lr,
@@ -443,8 +443,7 @@ def test_regression():
 
 
 def test_regression_sffs():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
     sfs_r = SFS(
         lr,
@@ -460,8 +459,7 @@ def test_regression_sffs():
 
 
 def test_regression_sbfs():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
     sfs_r = SFS(
         lr,
@@ -477,8 +475,7 @@ def test_regression_sbfs():
 
 
 def test_regression_in_range():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
     sfs_r = SFS(
         lr,
@@ -722,9 +719,7 @@ def test_string_scoring_clf():
 
 
 def test_max_feature_subset_size_in_tuple_range():
-    boston = load_boston()
-    X, y = boston.data, boston.target
-
+    X, y = boston_housing_data()
     lr = LinearRegression()
 
     sfs = SFS(
@@ -741,8 +736,7 @@ def test_max_feature_subset_size_in_tuple_range():
 
 
 def test_max_feature_subset_best():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
 
     sfs = SFS(lr, k_features="best", forward=True, floating=False, cv=10)
@@ -752,8 +746,7 @@ def test_max_feature_subset_best():
 
 
 def test_max_feature_subset_parsimonious():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
 
     sfs = SFS(lr, k_features="parsimonious", forward=True, floating=False, cv=10)
diff --git a/mlxtend/feature_selection/tests/test_sequential_feature_selector_feature_groups.py b/mlxtend/feature_selection/tests/test_sequential_feature_selector_feature_groups.py
@@ -4,14 +4,14 @@
 #
 # License: BSD 3 clause
 import numpy as np
-from numpy import nan
 from numpy.testing import assert_almost_equal
-from sklearn.datasets import load_boston, load_iris
+from sklearn.datasets import load_iris
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import LinearRegression
 from sklearn.metrics import roc_auc_score
 from sklearn.neighbors import KNeighborsClassifier
 
+from mlxtend.data import boston_housing_data
 from mlxtend.feature_selection import SequentialFeatureSelector as SFS
 from mlxtend.utils import assert_raises
 
@@ -99,8 +99,7 @@ def test_knn_wo_cv_feature_groups_default():
 
 
 def test_regression_sbfs():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
     sfs_r = SFS(
         lr,
@@ -163,8 +162,7 @@ def test_keyboard_interrupt():
 
 
 def test_max_feature_subset_best():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
 
     sfs = SFS(
@@ -189,8 +187,7 @@ def test_max_feature_subset_best():
 
 
 def test_max_feature_subset_parsimonious():
-    boston = load_boston()
-    X, y = boston.data, boston.target
+    X, y = boston_housing_data()
     lr = LinearRegression()
 
     sfs = SFS(