
Commit 122ded8

FIX address some feedback from forum (INRIA#337)
1 parent e5d424b commit 122ded8

5 files changed: +55 −17 lines

python_scripts/02_numerical_pipeline_ex_01.py

Lines changed: 15 additions & 0 deletions

```diff
@@ -62,8 +62,23 @@
 data_numeric_train, data_numeric_test, target_train, target_test = \
     train_test_split(data_numeric, target, random_state=0)
 
+# %% [markdown]
+# Split the dataset into a train and test sets.
 # %%
 from sklearn.model_selection import train_test_split
+
+# Write your code here.
+
+
+# %% [markdown]
+# Use a `DummyClassifier` such that the resulting classifier will always
+# predict the class `' >50K'`. What is the accuracy score on the test set?
+# Repeat the experiment by always predicting the class `' <=50K'`.
+#
+# Hint: you can refer to the parameter `strategy` of the `DummyClassifier`
+# to achieve the desired behaviour.
+
+# %%
 from sklearn.dummy import DummyClassifier
 
 # Write your code here.
```
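A sketch of what the exercise is after, assuming `strategy="constant"` with `constant=' >50K'` (toy data stands in for the adult census dataset; this is an illustration, not the course's official solution):

```python
# A DummyClassifier with strategy="constant" always predicts the given class;
# its accuracy on the test set is then the fraction of that class in y_test.
from sklearn.dummy import DummyClassifier

X_train = [[0], [1], [2], [3]]
y_train = [" >50K", " <=50K", " <=50K", " <=50K"]
X_test = [[4], [5]]
y_test = [" <=50K", " >50K"]

clf = DummyClassifier(strategy="constant", constant=" >50K")
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # 0.5: one of the two test labels is ' >50K'
```

Repeating with `constant=' <=50K'` gives the complementary accuracy, which is the point of the exercise.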

python_scripts/02_numerical_pipeline_hands_on.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -190,6 +190,11 @@
 #
 # To create a logistic regression model in scikit-learn you can do:
 
+# %%
+# to display nice model diagram
+from sklearn import set_config
+set_config(display='diagram')
+
 # %%
 from sklearn.linear_model import LogisticRegression
```
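The added cell flips a global scikit-learn option: with `display='diagram'`, estimators render as interactive HTML diagrams in notebooks. A minimal sketch, verifiable outside a notebook through `get_config`:

```python
from sklearn import get_config, set_config

set_config(display="diagram")  # estimators now render as HTML diagrams
# Outside a notebook, the effect is visible in the global config:
print(get_config()["display"])  # diagram
```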

python_scripts/02_numerical_pipeline_introduction.py

Lines changed: 12 additions & 7 deletions

```diff
@@ -83,14 +83,19 @@
 # into account its `k` closest samples in the training set and predicts the
 # majority target of these samples.
 #
-# The `fit` method is called to train the model from the input
-# (features) and target data.
-#
 # ```{caution}
 # We use a K-nearest neighbors here. However, be aware that it is seldom useful
 # in practice. We use it because it is an intuitive algorithm. In the next
 # notebook, we will introduce better models.
 # ```
+#
+# The `fit` method is called to train the model from the input (features) and
+# target data.
+
+# %%
+# to display nice model diagram
+from sklearn import set_config
+set_config(display='diagram')
 
 # %%
 from sklearn.neighbors import KNeighborsClassifier
@@ -105,12 +110,12 @@
 #
 # The method `fit` is composed of two elements: (i) a **learning algorithm**
 # and (ii) some **model states**. The learning algorithm takes the training
-# data and training target as input and set the model states. These model
-# states will be used later to either predict (for classifier and regressor) or
-# transform data (for transformers).
+# data and training target as input and sets the model states. These model
+# states will be used later to either predict (for classifiers and regressors)
+# or transform data (for transformers).
 #
 # Both the learning algorithm and the type of model states are specific to each
-# type of models.
+# type of model.
 
 # %% [markdown]
 # ```{note}
```

python_scripts/02_numerical_pipeline_scaling.py

Lines changed: 22 additions & 8 deletions

```diff
@@ -34,6 +34,11 @@
 
 adult_census = pd.read_csv("../datasets/adult-census.csv")
 
+# %%
+# to display nice model diagram
+from sklearn import set_config
+set_config(display='diagram')
+
 # %% [markdown]
 # We will now drop the target from the data we will use to train our
 # predictive model.
@@ -191,15 +196,24 @@
 # `Pipeline`, which chains together operations and is used as any other
 # classifier or regressor. The helper function `make_pipeline` will create a
 # `Pipeline`: it takes as arguments the successive transformations to perform,
-# followed by the classifier or regressor model, and will assign automatically
-# a name at steps based on the name of the classes.
+# followed by the classifier or regressor model.
 
 # %%
 import time
 from sklearn.linear_model import LogisticRegression
 from sklearn.pipeline import make_pipeline
 
 model = make_pipeline(StandardScaler(), LogisticRegression())
+model
+
+# %% [markdown]
+# The `make_pipeline` function did not require us to give a name to each step.
+# Indeed, it was automatically assigned based on the name of the classes
+# provided; a `StandardScaler` will be a step named `"standardscaler"` in the
+# resulting pipeline. We can check the name of each steps of our model:
+
+# %%
+model.named_steps
 
 # %% [markdown]
 # This predictive pipeline exposes the same methods as the final predictor:
```
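The naming behaviour the added markdown cell describes can be checked directly; a small sketch with the same `StandardScaler`/`LogisticRegression` pipeline as the notebook:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model = make_pipeline(StandardScaler(), LogisticRegression())
# make_pipeline derives each step's name from its lowercased class name
print(list(model.named_steps))  # ['standardscaler', 'logisticregression']
```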
```diff
@@ -278,11 +292,11 @@
 #
 # ```{warning}
 # Working with non-scaled data will potentially force the algorithm to iterate
-# more as we showed in the example above. There is also a catastrophic scenario
-# where the number of required iterations are more than the maximum number of
-# iterations allowed by the predictor (controlled by the `max_iter`) parameter.
-# Therefore, before increasing `max_iter`, make sure that the data are well
-# scaled.
+# more as we showed in the example above. There is also the catastrophic
+# scenario where the number of required iterations are more than the maximum
+# number of iterations allowed by the predictor (controlled by the `max_iter`)
+# parameter. Therefore, before increasing `max_iter`, make sure that the data
+# are well scaled.
 # ```
 
 # %% [markdown]
```
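The reworded warning can be illustrated by comparing the solver's `n_iter_` attribute with and without scaling; a sketch under our own toy setup (synthetic data with one feature blown up in scale, not the course's dataset):

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, random_state=0)
# make one feature three orders of magnitude larger than the rest
X_bad = X * np.array([1e3] + [1.0] * (X.shape[1] - 1))

scaled = make_pipeline(StandardScaler(), LogisticRegression()).fit(X_bad, y)
raw = LogisticRegression(max_iter=10_000).fit(X_bad, y)
# on badly scaled data the solver typically needs many more iterations,
# which is why simply raising max_iter is the wrong first move
print(scaled[-1].n_iter_[0], raw.n_iter_[0])
```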
```diff
@@ -298,7 +312,7 @@
 # the procedure such that the training and testing sets are different each
 # time. Statistical performance metrics are collected for each repetition and
 # then aggregated. As a result we can get an estimate of the variability of the
-# model statistical performance.
+# model's statistical performance.
 #
 # Note that there exists several cross-validation strategies, each of them
 # defines how to repeat the `fit`/`score` procedure. In this section, we will
```
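The repeated fit/score procedure this paragraph describes can be sketched with `cross_validate` (synthetic data of our own; the pipeline mirrors the notebook's):

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=200, random_state=0)
model = make_pipeline(StandardScaler(), LogisticRegression())

# 5-fold cross-validation: 5 fit/score repetitions on different splits
cv_results = cross_validate(model, X, y, cv=5)
scores = cv_results["test_score"]
print(scores.mean(), scores.std())  # point estimate and its variability
```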

python_scripts/03_categorical_pipeline_column_transformer.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -118,8 +118,7 @@
 model = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
 
 # %% [markdown]
-# Starting from `scikit-learn 0.23`, the notebooks can display an interactive
-# view of the pipelines.
+# We can display an interactive diagram with the following command:
 
 # %%
 from sklearn import set_config
```
