Commit 019fc9b

ENH labels parameter in P/R/F may extend or reduce label set

1 parent d1d95e8 commit 019fc9b

File tree: 5 files changed, +183 −52 lines changed

    doc/modules/model_evaluation.rst
    doc/whats_new.rst
    sklearn/metrics/classification.py
    sklearn/metrics/tests/test_classification.py
    sklearn/metrics/tests/test_common.py

doc/modules/model_evaluation.rst

Lines changed: 16 additions & 3 deletions
@@ -155,7 +155,7 @@ take several parameters:
   certainties (``needs_threshold=True``). The default value is
   False.
 
-* any additional parameters, such as ``beta`` in an :func:`f1_score`.
+* any additional parameters, such as ``beta`` or ``labels`` in :func:`f1_score`.
 
 Here is an example of building custom scorers, and of using the
 ``greater_is_better`` parameter::

@@ -657,8 +657,9 @@ specified by the ``average`` argument to the
 :func:`fbeta_score`, :func:`precision_recall_fscore_support`,
 :func:`precision_score` and :func:`recall_score` functions, as described
 :ref:`above <average>`. Note that for "micro"-averaging in a multiclass setting
-will produce equal precision, recall and :math:`F`, while "weighted" averaging
-may produce an F-score that is not between precision and recall.
+with all labels included will produce equal precision, recall and :math:`F`,
+while "weighted" averaging may produce an F-score that is not between
+precision and recall.
 
 To make this more explicit, consider the following notation:
 

@@ -709,6 +710,18 @@ Then the metrics are defined as:
   ... # doctest: +ELLIPSIS
   (array([ 0.66..., 0. , 0. ]), array([ 1., 0., 0.]), array([ 0.71..., 0. , 0. ]), array([2, 2, 2]...))
 
+For multiclass classification with a "negative class", it is possible to exclude some labels:
+
+  >>> metrics.recall_score(y_true, y_pred, labels=[1, 2], average='micro')
+  ... # excluding 0, no labels were correctly recalled
+  0.0
+
+Similarly, labels not present in the data sample may be accounted for in macro-averaging.
+
+  >>> metrics.precision_score(y_true, y_pred, labels=[0, 1, 2, 3], average='macro')
+  ... # doctest: +ELLIPSIS
+  0.166...
+
 .. _hinge_loss:
 
 Hinge loss
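A minimal sketch of the two behaviours documented in the hunk above, using invented toy targets (these arrays are not the ones from the doctest, and the exact warnings emitted for labels with no true samples may vary by version):

    from sklearn import metrics

    y_true = [0, 0, 0, 1, 2]   # class 0 acts as a majority "negative" class
    y_pred = [0, 0, 1, 1, 0]

    # Reduce the label set: average only over classes 1 and 2.
    # recall(1) = 1/1, recall(2) = 0/1  ->  macro average 0.5
    print(metrics.recall_score(y_true, y_pred, labels=[1, 2], average='macro'))

    # Extend the label set: class 3 never occurs, so it contributes a
    # zero component -> (2/3 + 1.0 + 0.0 + 0.0) / 4 = 0.4166...
    print(metrics.recall_score(y_true, y_pred, labels=[0, 1, 2, 3], average='macro'))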

doc/whats_new.rst

Lines changed: 8 additions & 0 deletions
@@ -64,6 +64,14 @@ Enhancements
    - Added backlinks from the API reference pages to the user guide. By
      `Andreas Müller`_.
 
+   - The ``labels`` parameter to :func:`sklearn.metrics.f1_score`,
+     :func:`sklearn.metrics.fbeta_score`,
+     :func:`sklearn.metrics.recall_score` and
+     :func:`sklearn.metrics.precision_score` has been extended.
+     It is now possible to ignore one or more labels, such as where
+     a multiclass problem has a majority class to ignore. By `Joel Nothman`_.
+
+
 Bug fixes
 .........

sklearn/metrics/classification.py

Lines changed: 87 additions & 45 deletions
@@ -517,8 +517,14 @@ def f1_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
     y_pred : 1d array-like, or label indicator array / sparse matrix
         Estimated targets as returned by a classifier.
 
-    labels : array
-        Integer array of labels.
+    labels : list, optional
+        The set of labels to include when ``average != 'binary'``, and their
+        order if ``average is None``. Labels present in the data can be
+        excluded, for example to calculate a multiclass average ignoring a
+        majority negative class, while labels not present in the data will
+        result in 0 components in a macro average. For multilabel targets,
+        labels are column indices. By default, all labels in ``y_true`` and
+        ``y_pred`` are used in sorted order.
 
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
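To make the "order if ``average is None``" clause above concrete, here is a small sketch with invented targets (not part of the diff); per-class scores come back in the order the labels are listed, not in sorted order:

    from sklearn.metrics import f1_score

    y_true = [0, 1, 2, 2]
    y_pred = [0, 2, 2, 2]

    # Class 2 is reported first, then class 0, then class 1:
    # approximately [0.8, 1.0, 0.0]. Class 1 is never predicted, so its
    # F-score is set to 0 (a warning about the undefined value may be emitted).
    print(f1_score(y_true, y_pred, labels=[2, 0, 1], average=None))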
@@ -614,8 +620,14 @@ def fbeta_score(y_true, y_pred, beta, labels=None, pos_label=1,
     beta: float
         Weight of precision in harmonic mean.
 
-    labels : array
-        Integer array of labels.
+    labels : list, optional
+        The set of labels to include when ``average != 'binary'``, and their
+        order if ``average is None``. Labels present in the data can be
+        excluded, for example to calculate a multiclass average ignoring a
+        majority negative class, while labels not present in the data will
+        result in 0 components in a macro average. For multilabel targets,
+        labels are column indices. By default, all labels in ``y_true`` and
+        ``y_pred`` are used in sorted order.
 
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
@@ -784,8 +796,14 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
     beta : float, 1.0 by default
         The strength of recall versus precision in the F-score.
 
-    labels : array
-        Integer array of labels.
+    labels : list, optional
+        The set of labels to include when ``average != 'binary'``, and their
+        order if ``average is None``. Labels present in the data can be
+        excluded, for example to calculate a multiclass average ignoring a
+        majority negative class, while labels not present in the data will
+        result in 0 components in a macro average. For multilabel targets,
+        labels are column indices. By default, all labels in ``y_true`` and
+        ``y_pred`` are used in sorted order.
 
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
@@ -879,6 +897,7 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         raise ValueError("beta should be >0 in the F-beta score")
 
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
+    present_labels = unique_labels(y_true, y_pred)
 
     if average == 'binary' and (y_type != 'binary' or pos_label is None):
         warnings.warn('The default `weighted` averaging is deprecated, '
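For readers who have not met the helper used on the new line, ``unique_labels`` (from ``sklearn.utils.multiclass``) pools the labels found in all of its arguments and returns them as a sorted array; a tiny sketch with made-up inputs:

    from sklearn.utils.multiclass import unique_labels

    print(unique_labels([3, 1, 2], [1, 1, 4]))   # array([1, 2, 3, 4])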
@@ -891,17 +910,49 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
                       % str(average_options), DeprecationWarning, stacklevel=2)
         average = 'weighted'
 
-    label_order = labels  # save this for later
+    if y_type == 'binary' and pos_label is not None and average is not None:
+        if average != 'binary':
+            warnings.warn('From version 0.18, binary input will not be '
+                          'handled specially when using averaged '
+                          'precision/recall/F-score. '
+                          'Please use average=\'binary\' to report only the '
+                          'positive class performance.', DeprecationWarning)
+        if labels is None or len(labels) <= 2:
+            if pos_label not in present_labels:
+                if len(present_labels) < 2:
+                    # Only negative labels
+                    return (0., 0., 0., 0)
+                else:
+                    raise ValueError("pos_label=%r is not a valid label: %r" %
+                                     (pos_label, present_labels))
+            labels = [pos_label]
     if labels is None:
-        labels = unique_labels(y_true, y_pred)
+        labels = present_labels
+        n_labels = None
     else:
-        labels = np.sort(labels)
+        n_labels = len(labels)
+        labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
+                                                 assume_unique=True)])
 
     ### Calculate tp_sum, pred_sum, true_sum ###
 
     if y_type.startswith('multilabel'):
         sum_axis = 1 if average == 'samples' else 0
 
+        # All labels are index integers for multilabel.
+        # Select labels:
+        if not np.all(labels == present_labels):
+            if np.max(labels) > np.max(present_labels):
+                raise ValueError('All labels must be in [0, n labels). '
+                                 'Got %d > %d' %
+                                 (np.max(labels), np.max(present_labels)))
+            if np.min(labels) < 0:
+                raise ValueError('All labels must be in [0, n labels). '
+                                 'Got %d < 0' % np.min(labels))
+
+        y_true = y_true[:, labels[:n_labels]]
+        y_pred = y_pred[:, labels[:n_labels]]
+
         # calculate weighted counts
         true_and_pred = y_true.multiply(y_pred)
         tp_sum = count_nonzero(true_and_pred, axis=sum_axis,
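The ``hstack``/``setdiff1d`` step above is the core bookkeeping trick: the requested labels are placed first and every other label that actually occurs in the data is appended, while ``n_labels`` remembers how many leading entries were requested. The full set is kept so that the encoding and counting below can handle every value present in ``y_true``/``y_pred``, even though only the leading slice is reported. A standalone NumPy sketch with invented values (not the function itself):

    import numpy as np

    labels = np.array([1, 3])                # user-requested subset
    present_labels = np.array([1, 2, 3, 4])  # what unique_labels() found in the data

    n_labels = len(labels)
    labels = np.hstack([labels, np.setdiff1d(present_labels, labels,
                                             assume_unique=True)])
    print(labels)             # [1 3 2 4] -> requested labels first, the rest appended
    print(labels[:n_labels])  # [1 3]     -> the slice later used to select results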
@@ -916,11 +967,11 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
                              "not meaningful outside multilabel "
                              "classification. See the accuracy_score instead.")
     else:
-        lb = LabelEncoder()
-        lb.fit(labels)
-        y_true = lb.transform(y_true)
-        y_pred = lb.transform(y_pred)
-        labels = lb.classes_
+        le = LabelEncoder()
+        le.fit(labels)
+        y_true = le.transform(y_true)
+        y_pred = le.transform(y_pred)
+        sorted_labels = le.classes_
 
         # labels are now from 0 to len(labels) - 1 -> use bincount
         tp = y_true == y_pred
@@ -943,28 +994,13 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         true_sum = bincount(y_true, weights=sample_weight,
                             minlength=len(labels))
 
-    ### Select labels to keep ###
+        # Retain only selected labels
+        indices = np.searchsorted(sorted_labels, labels[:n_labels])
+        tp_sum = tp_sum[indices]
+        true_sum = true_sum[indices]
+        pred_sum = pred_sum[indices]
 
-    if y_type == 'binary' and average is not None and pos_label is not None:
-        if average != 'binary':
-            warnings.warn('From version 0.18, binary input will not be '
-                          'handled specially when using averaged '
-                          'precision/recall/F-score. '
-                          'Please use average=\'binary\' to report only the '
-                          'positive class performance.', DeprecationWarning)
-        if pos_label not in labels:
-            if len(labels) == 1:
-                # Only negative labels
-                return (0., 0., 0., 0)
-            else:
-                raise ValueError("pos_label=%r is not a valid label: %r" %
-                                 (pos_label, labels))
-        pos_label_idx = labels == pos_label
-        tp_sum = tp_sum[pos_label_idx]
-        pred_sum = pred_sum[pos_label_idx]
-        true_sum = true_sum[pos_label_idx]
-
-    elif average == 'micro':
+    if average == 'micro':
         tp_sum = np.array([tp_sum.sum()])
         pred_sum = np.array([pred_sum.sum()])
         true_sum = np.array([true_sum.sum()])
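After ``bincount`` the per-class counts are in sorted label order (``le.classes_``), so ``searchsorted`` maps the leading ``labels[:n_labels]`` entries back to their positions and only those rows are kept. A sketch continuing the invented values from the previous snippet:

    import numpy as np

    sorted_labels = np.array([1, 2, 3, 4])   # le.classes_ after fitting on all labels
    tp_sum = np.array([5, 1, 2, 0])          # made-up per-class true-positive counts
    labels = np.array([1, 3, 2, 4])          # requested labels first (see above)
    n_labels = 2

    indices = np.searchsorted(sorted_labels, labels[:n_labels])
    print(indices)           # [0 2] -> positions of the requested labels 1 and 3
    print(tp_sum[indices])   # [5 2] -> counts restricted to the requested labels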
@@ -1004,12 +1040,6 @@ def precision_recall_fscore_support(y_true, y_pred, beta=1.0, labels=None,
         recall = np.average(recall, weights=weights)
         f_score = np.average(f_score, weights=weights)
         true_sum = None  # return no support
-    elif label_order is not None:
-        indices = np.searchsorted(labels, label_order)
-        precision = precision[indices]
-        recall = recall[indices]
-        f_score = f_score[indices]
-        true_sum = true_sum[indices]
 
     return precision, recall, f_score, true_sum
 
@@ -1035,8 +1065,14 @@ def precision_score(y_true, y_pred, labels=None, pos_label=1,
     y_pred : 1d array-like, or label indicator array / sparse matrix
         Estimated targets as returned by a classifier.
 
-    labels : array
-        Integer array of labels.
+    labels : list, optional
+        The set of labels to include when ``average != 'binary'``, and their
+        order if ``average is None``. Labels present in the data can be
+        excluded, for example to calculate a multiclass average ignoring a
+        majority negative class, while labels not present in the data will
+        result in 0 components in a macro average. For multilabel targets,
+        labels are column indices. By default, all labels in ``y_true`` and
+        ``y_pred`` are used in sorted order.
 
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is
@@ -1128,8 +1164,14 @@ def recall_score(y_true, y_pred, labels=None, pos_label=1, average='binary',
     y_pred : 1d array-like, or label indicator array / sparse matrix
         Estimated targets as returned by a classifier.
 
-    labels : array
-        Integer array of labels.
+    labels : list, optional
+        The set of labels to include when ``average != 'binary'``, and their
+        order if ``average is None``. Labels present in the data can be
+        excluded, for example to calculate a multiclass average ignoring a
+        majority negative class, while labels not present in the data will
+        result in 0 components in a macro average. For multilabel targets,
+        labels are column indices. By default, all labels in ``y_true`` and
+        ``y_pred`` are used in sorted order.
 
     pos_label : str or int, 1 by default
         The class to report if ``average='binary'``. Until version 0.18 it is

sklearn/metrics/tests/test_classification.py

Lines changed: 69 additions & 1 deletion
@@ -11,6 +11,7 @@
 
 from sklearn.datasets import make_multilabel_classification
 from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
+from sklearn.preprocessing import label_binarize
 from sklearn.utils.fixes import np_version
 from sklearn.utils.validation import check_random_state
 
@@ -173,6 +174,73 @@ def test_precision_recall_f_binary_single_class():
     assert_equal(0., f1_score([-1, -1], [-1, -1]))
 
 
+@ignore_warnings
+def test_precision_recall_f_extra_labels():
+    """Test handling of explicit additional (not in input) labels to PRF
+    """
+    y_true = [1, 3, 3, 2]
+    y_pred = [1, 1, 3, 2]
+    y_true_bin = label_binarize(y_true, classes=np.arange(5))
+    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
+    data = [(y_true, y_pred),
+            (y_true_bin, y_pred_bin)]
+
+    for i, (y_true, y_pred) in enumerate(data):
+        # No average: zeros in array
+        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
+                              average=None)
+        assert_array_almost_equal([0., 1., 1., .5, 0.], actual)
+
+        # Macro average is changed
+        actual = recall_score(y_true, y_pred, labels=[0, 1, 2, 3, 4],
+                              average='macro')
+        assert_array_almost_equal(np.mean([0., 1., 1., .5, 0.]), actual)
+
+        # No effect otherwise
+        for average in ['micro', 'weighted', 'samples']:
+            if average == 'samples' and i == 0:
+                continue
+            assert_almost_equal(recall_score(y_true, y_pred,
+                                             labels=[0, 1, 2, 3, 4],
+                                             average=average),
+                                recall_score(y_true, y_pred, labels=None,
+                                             average=average))
+
+    # Error when introducing invalid label in multilabel case
+    # (although it would only affect performance if average='macro'/None)
+    for average in [None, 'macro', 'micro', 'samples']:
+        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
+                      labels=np.arange(6), average=average)
+        assert_raises(ValueError, recall_score, y_true_bin, y_pred_bin,
+                      labels=np.arange(-1, 4), average=average)
+
+
+@ignore_warnings
+def test_precision_recall_f_ignored_labels():
+    """Test a subset of labels may be requested for PRF"""
+    y_true = [1, 1, 2, 3]
+    y_pred = [1, 3, 3, 3]
+    y_true_bin = label_binarize(y_true, classes=np.arange(5))
+    y_pred_bin = label_binarize(y_pred, classes=np.arange(5))
+    data = [(y_true, y_pred),
+            (y_true_bin, y_pred_bin)]
+
+    for i, (y_true, y_pred) in enumerate(data):
+        recall_13 = partial(recall_score, y_true, y_pred, labels=[1, 3])
+        recall_all = partial(recall_score, y_true, y_pred, labels=None)
+
+        assert_array_almost_equal([.5, 1.], recall_13(average=None))
+        assert_almost_equal((.5 + 1.) / 2, recall_13(average='macro'))
+        assert_almost_equal((.5 * 2 + 1. * 1) / 3,
+                            recall_13(average='weighted'))
+        assert_almost_equal(2. / 3, recall_13(average='micro'))
+
+        # ensure the above were meaningful tests:
+        for average in ['macro', 'weighted', 'micro']:
+            assert_not_equal(recall_13(average=average),
+                             recall_all(average=average))
+
+
 def test_average_precision_score_score_non_binary_class():
     # Test that average_precision_score function returns an error when trying
     # to compute average_precision_score for multiclass task.
@@ -315,7 +383,7 @@ def test_precision_refcall_f1_score_multilabel_unordered_labels():
     y_pred = np.array([[0, 0, 1, 1]])
     for average in ['samples', 'micro', 'macro', 'weighted', None]:
         p, r, f, s = precision_recall_fscore_support(
-            y_true, y_pred, labels=[4, 1, 2, 3], warn_for=[], average=average)
+            y_true, y_pred, labels=[3, 0, 1, 2], warn_for=[], average=average)
         assert_array_equal(p, 0)
         assert_array_equal(r, 0)
         assert_array_equal(f, 0)

sklearn/metrics/tests/test_common.py

Lines changed: 3 additions & 3 deletions
@@ -1085,9 +1085,9 @@ def test_no_averaging_labels():
     # in multi-class and multi-label cases
     y_true_multilabel = np.array([[1, 1, 0, 0], [1, 1, 0, 0]])
     y_pred_multilabel = np.array([[0, 0, 1, 1], [0, 1, 1, 0]])
-    y_true_multiclass = np.array([1, 2, 3])
-    y_pred_multiclass = np.array([1, 3, 4])
-    labels = np.array([4, 1, 2, 3])
+    y_true_multiclass = np.array([0, 1, 2])
+    y_pred_multiclass = np.array([0, 2, 3])
+    labels = np.array([3, 0, 1, 2])
     _, inverse_labels = np.unique(labels, return_inverse=True)
 
     for name in METRICS_WITH_AVERAGING:
