|
27 | 27 | from __future__ import print_function |
28 | 28 |
|
29 | 29 | import pandas as pd |
| 30 | +import numpy as np |
30 | 31 |
|
31 | | -from sklearn.compose import make_column_transformer |
32 | | -from sklearn.pipeline import make_pipeline |
| 32 | +from sklearn.compose import ColumnTransformer |
| 33 | +from sklearn.pipeline import Pipeline |
33 | 34 | from sklearn.impute import SimpleImputer |
34 | 35 | from sklearn.preprocessing import StandardScaler, CategoricalEncoder |
35 | 36 | from sklearn.linear_model import LogisticRegression |
36 | 37 | from sklearn.model_selection import train_test_split, GridSearchCV |
37 | 38 |
|
| 39 | +np.random.seed(0) |
38 | 40 |
|
39 | 41 | # Read data from the Titanic dataset. |
40 | 42 | titanic_url = ('https://raw.githubusercontent.com/amueller/' |
|
49 | 51 | # - embarked: categories encoded as strings {'C', 'S', 'Q'}. |
50 | 52 | # - sex: categories encoded as strings {'female', 'male'}. |
51 | 53 | # - pclass: ordinal integers {1, 2, 3}. |
52 | | -numeric_features = ['age', 'fare'] |
53 | | -categorical_features = ['embarked', 'sex', 'pclass'] |
54 | | - |
55 | | -# Provisionally, use pd.fillna() to impute missing values for categorical |
56 | | -# features; SimpleImputer will eventually support strategy="constant". |
57 | | -data[categorical_features] = data[categorical_features].fillna(value='missing') |
58 | 54 |
|
59 | 55 | # We create the preprocessing pipelines for both numeric and categorical data. |
60 | | -numeric_transformer = make_pipeline(SimpleImputer(), StandardScaler()) |
61 | | -categorical_transformer = CategoricalEncoder('onehot-dense', |
62 | | - handle_unknown='ignore') |
| 56 | +numeric_features = ['age', 'fare'] |
| 57 | +numeric_transformer = Pipeline(steps=[ |
| 58 | + ('imputer', SimpleImputer(strategy='median')), |
| 59 | + ('scaler', StandardScaler())]) |
| 60 | + |
| 61 | +categorical_features = ['embarked', 'sex', 'pclass'] |
| 62 | +categorical_transformer = Pipeline(steps=[ |
| 63 | + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), |
| 64 | + ('onehot', CategoricalEncoder('onehot-dense', handle_unknown='ignore'))]) |
63 | 65 |
|
64 | | -preprocessing_pl = make_column_transformer( |
65 | | - (numeric_features, numeric_transformer), |
66 | | - (categorical_features, categorical_transformer), |
67 | | - remainder='drop' |
68 | | -) |
| 66 | +preprocessor = ColumnTransformer( |
| 67 | + transformers=[ |
| 68 | + ('num', numeric_transformer, numeric_features), |
| 69 | + ('cat', categorical_transformer, categorical_features)], |
| 70 | + remainder='drop') |
69 | 71 |
|
70 | 72 | # Append the classifier to the preprocessing pipeline. |
71 | 73 | # Now we have a full prediction pipeline. |
72 | | -clf = make_pipeline(preprocessing_pl, LogisticRegression()) |
| 74 | +clf = Pipeline(steps=[('preprocessor', preprocessor), |
| 75 | + ('classifier', LogisticRegression())]) |
73 | 76 |
|
74 | 77 | X = data.drop('survived', axis=1) |
75 | | -y = data.survived.values |
| 78 | +y = data['survived'] |
76 | 79 |
|
77 | 80 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, |
78 | 81 | shuffle=True) |
79 | 82 |
|
80 | 83 | clf.fit(X_train, y_train) |
81 | | -print("model score: %f" % clf.score(X_test, y_test)) |
| 84 | +print("model score: %.3f" % clf.score(X_test, y_test)) |
82 | 85 |
|
83 | 86 |
|
84 | 87 | ############################################################################### |
|
93 | 96 |
|
94 | 97 |
|
95 | 98 | param_grid = { |
96 | | - 'columntransformer__pipeline__simpleimputer__strategy': ['mean', 'median'], |
97 | | - 'logisticregression__C': [0.1, 1.0, 1.0], |
| 99 | + 'preprocessor__num__imputer__strategy': ['mean', 'median'], |
| 100 | + 'classifier__C': [0.1, 1.0, 10, 100], |
98 | 101 | } |
99 | 102 |
|
100 | 103 | grid_search = GridSearchCV(clf, param_grid, cv=10, iid=False) |
101 | 104 | grid_search.fit(X_train, y_train) |
102 | 105 |
|
103 | | -print(("best logistic regression from grid search: %f" |
| 106 | +print(("best logistic regression from grid search: %.3f" |
104 | 107 | % grid_search.score(X_test, y_test))) |
0 commit comments