Merge pull request scikit-learn#3472 from arjoly/fix-metrics-division

agramfort · agramfort · commit 24bd8f903057 · 2014-07-22T15:03:49.000+02:00
MAINT move log_loss and hinge_loss to the classification metrics
diff --git a/sklearn/metrics/__init__.py b/sklearn/metrics/__init__.py
@@ -6,19 +6,19 @@
 from .ranking import auc
 from .ranking import average_precision_score
 from .ranking import label_ranking_average_precision_score
-from .ranking import log_loss
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
-from .ranking import hinge_loss
 
 from .classification import accuracy_score
 from .classification import classification_report
 from .classification import confusion_matrix
 from .classification import f1_score
 from .classification import fbeta_score
 from .classification import hamming_loss
+from .classification import hinge_loss
 from .classification import jaccard_similarity_score
+from .classification import log_loss
 from .classification import matthews_corrcoef
 from .classification import precision_recall_fscore_support
 from .classification import precision_score
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
@@ -27,6 +27,7 @@
 
 from ..externals.six.moves import zip
 from ..preprocessing import label_binarize
+from ..preprocessing import LabelBinarizer
 from ..preprocessing import LabelEncoder
 from ..utils import check_array, check_consistent_length
 from ..utils import column_or_1d
@@ -1306,3 +1307,153 @@ def hamming_loss(y_true, y_pred, classes=None):
         return sp_hamming(y_true, y_pred)
     else:
         raise ValueError("{0} is not supported".format(y_type))
+
+
+def log_loss(y_true, y_pred, eps=1e-15, normalize=True):
+    """Log loss, aka logistic loss or cross-entropy loss.
+
+    This is the loss function used in (multinomial) logistic regression
+    and extensions of it such as neural networks, defined as the negative
+    log-likelihood of the true labels given a probabilistic classifier's
+    predictions. For a single sample with true label yt in {0,1} and
+    estimated probability yp that yt = 1, the log loss is
+
+        -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
+
+    Parameters
+    ----------
+    y_true : array-like or label indicator matrix
+        Ground truth (correct) labels for n_samples samples.
+
+    y_pred : array-like of float, shape = (n_samples, n_classes)
+        Predicted probabilities, e.g. from a classifier's predict_proba. A
+        1-D or single-column input p is expanded to columns [1 - p, p].
+
+    eps : float, optional (default=1e-15)
+        Log loss is undefined for p=0 or p=1, so probabilities are
+        clipped to max(eps, min(1 - eps, p)).
+
+    normalize : bool, optional (default=True)
+        If true, return the mean loss per sample.
+        Otherwise, return the sum of the per-sample losses.
+
+    Returns
+    -------
+    loss : float
+
+    Examples
+    --------
+    >>> log_loss(["spam", "ham", "ham", "spam"],  # doctest: +ELLIPSIS
+    ...          [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
+    0.21616...
+
+    References
+    ----------
+    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
+    p. 209.
+
+    Notes
+    -----
+    The logarithm used is the natural logarithm (base-e).
+    """
+    lb = LabelBinarizer()
+    T = lb.fit_transform(y_true)
+    if T.shape[1] == 1:
+        T = np.append(1 - T, T, axis=1)
+
+    # Clip probabilities into [eps, 1 - eps] before taking the logarithm.
+    Y = np.clip(y_pred, eps, 1 - eps)
+
+    # Y is not an ndarray in cases when elements in y_pred have type "str".
+    if not isinstance(Y, np.ndarray):
+        raise ValueError("y_pred should be an array of floats.")
+
+    # A 1-D or single-column y_pred is assumed to describe a binary problem:
+    # expand p to the two columns [1 - p, p] before checking against T.
+    if Y.ndim == 1:
+        Y = Y[:, np.newaxis]
+    if Y.shape[1] == 1:
+        Y = np.append(1 - Y, Y, axis=1)
+
+    # Check that the dimensions of T and Y are consistent.
+    check_consistent_length(T, Y)
+    T = check_array(T)
+    Y = check_array(Y)
+    if T.shape[1] != Y.shape[1]:
+        raise ValueError("y_true and y_pred have different number of classes "
+                         "%d, %d" % (T.shape[1], Y.shape[1]))
+
+    # Renormalize so that each row of Y sums to one.
+    Y /= Y.sum(axis=1)[:, np.newaxis]
+    loss = -(T * np.log(Y)).sum()
+    return loss / T.shape[0] if normalize else loss
+
+
+def hinge_loss(y_true, pred_decision, pos_label=None, neg_label=None):
+    """Average hinge loss (non-regularized)
+
+    Assuming labels in y_true are encoded with +1 and -1, when a prediction
+    mistake is made, ``margin = y_true * pred_decision`` is always negative
+    (since the signs disagree), implying ``1 - margin`` is always greater than
+    1.  The cumulated hinge loss is therefore an upper bound of the number of
+    mistakes made by the classifier.
+
+    Parameters
+    ----------
+    y_true : array, shape = [n_samples]
+        True target, consisting of integers of two values. The positive label
+        must be greater than the negative label.
+
+    pred_decision : array, shape = [n_samples] or [n_samples, 1]
+        Predicted decisions, as output by decision_function (floats).
+
+    Returns
+    -------
+    loss : float
+
+    References
+    ----------
+    .. [1] `Wikipedia entry on the Hinge loss
+            <http://en.wikipedia.org/wiki/Hinge_loss>`_
+
+    Examples
+    --------
+    >>> from sklearn import svm
+    >>> from sklearn.metrics import hinge_loss
+    >>> X = [[0], [1]]
+    >>> y = [-1, 1]
+    >>> est = svm.LinearSVC(random_state=0)
+    >>> est.fit(X, y)
+    LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
+         intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
+         random_state=0, tol=0.0001, verbose=0)
+    >>> pred_decision = est.decision_function([[-2], [3], [0.5]])
+    >>> pred_decision  # doctest: +ELLIPSIS
+    array([-2.18...,  2.36...,  0.09...])
+    >>> hinge_loss([-1, 1, 1], pred_decision)  # doctest: +ELLIPSIS
+    0.30...
+
+    """
+    # TODO: multi-class hinge-loss; pos_label and neg_label are unused so far
+    check_consistent_length(y_true, pred_decision)
+    y_true = column_or_1d(y_true)
+    pred_decision = column_or_1d(pred_decision)
+
+    # The rest of the code assumes that the positive and negative labels
+    # are encoded as +1 and -1 respectively, hence neg_label=-1 below.
+    lbin = LabelBinarizer(neg_label=-1)
+    y_true = lbin.fit_transform(y_true)[:, 0]
+
+    if len(lbin.classes_) > 2 or (pred_decision.ndim == 2
+                                  and pred_decision.shape[1] != 1):
+        raise ValueError("Multi-class hinge loss not supported")
+    pred_decision = np.ravel(pred_decision)
+
+    try:
+        margin = y_true * pred_decision
+    except TypeError:
+        raise TypeError("pred_decision should be an array of floats.")
+    losses = 1 - margin
+    # The hinge doesn't penalize predictions whose margin is at least 1.
+    losses[losses <= 0] = 0
+    return np.mean(losses)
diff --git a/sklearn/metrics/metrics.py b/sklearn/metrics/metrics.py
@@ -1,25 +1,25 @@
 import warnings
 warnings.warn("sklearn.metrics.metrics is deprecated and will be remove in "
-             "0.18. Please import from sklearn.metrics",
-             DeprecationWarning)
+              "0.18. Please import from sklearn.metrics",
+              DeprecationWarning)
 
 
 from .ranking import auc
 from .ranking import average_precision_score
 from .ranking import label_ranking_average_precision_score
-from .ranking import log_loss
 from .ranking import precision_recall_curve
 from .ranking import roc_auc_score
 from .ranking import roc_curve
-from .ranking import hinge_loss
 
 from .classification import accuracy_score
 from .classification import classification_report
 from .classification import confusion_matrix
 from .classification import f1_score
 from .classification import fbeta_score
 from .classification import hamming_loss
+from .classification import hinge_loss
 from .classification import jaccard_similarity_score
+from .classification import log_loss
 from .classification import matthews_corrcoef
 from .classification import precision_recall_fscore_support
 from .classification import precision_score
diff --git a/sklearn/metrics/ranking.py b/sklearn/metrics/ranking.py
@@ -102,76 +102,6 @@ def auc(x, y, reorder=False):
     return area
 
 
-def hinge_loss(y_true, pred_decision, pos_label=None, neg_label=None):
-    """Average hinge loss (non-regularized)
-
-    Assuming labels in y_true are encoded with +1 and -1, when a prediction
-    mistake is made, ``margin = y_true * pred_decision`` is always negative
-    (since the signs disagree), implying ``1 - margin`` is always greater than
-    1.  The cumulated hinge loss is therefore an upper bound of the number of
-    mistakes made by the classifier.
-
-    Parameters
-    ----------
-    y_true : array, shape = [n_samples]
-        True target, consisting of integers of two values. The positive label
-        must be greater than the negative label.
-
-    pred_decision : array, shape = [n_samples] or [n_samples, n_classes]
-        Predicted decisions, as output by decision_function (floats).
-
-    Returns
-    -------
-    loss : float
-
-    References
-    ----------
-    .. [1] `Wikipedia entry on the Hinge loss
-            <http://en.wikipedia.org/wiki/Hinge_loss>`_
-
-    Examples
-    --------
-    >>> from sklearn import svm
-    >>> from sklearn.metrics import hinge_loss
-    >>> X = [[0], [1]]
-    >>> y = [-1, 1]
-    >>> est = svm.LinearSVC(random_state=0)
-    >>> est.fit(X, y)
-    LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
-         intercept_scaling=1, loss='l2', multi_class='ovr', penalty='l2',
-         random_state=0, tol=0.0001, verbose=0)
-    >>> pred_decision = est.decision_function([[-2], [3], [0.5]])
-    >>> pred_decision  # doctest: +ELLIPSIS
-    array([-2.18...,  2.36...,  0.09...])
-    >>> hinge_loss([-1, 1, 1], pred_decision)  # doctest: +ELLIPSIS
-    0.30...
-
-    """
-    # TODO: multi-class hinge-loss
-    check_consistent_length(y_true, pred_decision)
-    y_true = column_or_1d(y_true)
-    pred_decision = column_or_1d(pred_decision)
-
-    # the rest of the code assumes that positive and negative labels
-    # are encoded as +1 and -1 respectively
-    lbin = LabelBinarizer(neg_label=-1)
-    y_true = lbin.fit_transform(y_true)[:, 0]
-
-    if len(lbin.classes_) > 2 or (pred_decision.ndim == 2
-                                  and pred_decision.shape[1] != 1):
-        raise ValueError("Multi-class hinge loss not supported")
-    pred_decision = np.ravel(pred_decision)
-
-    try:
-        margin = y_true * pred_decision
-    except TypeError:
-        raise TypeError("pred_decision should be an array of floats.")
-    losses = 1 - margin
-    # The hinge doesn't penalize good enough predictions.
-    losses[losses <= 0] = 0
-    return np.mean(losses)
-
-
 def average_precision_score(y_true, y_score, average="macro",
                             sample_weight=None):
     """Compute average precision (AP) from prediction scores
@@ -617,86 +547,6 @@ class or confidence values.
     return fpr, tpr, thresholds
 
 
-def log_loss(y_true, y_pred, eps=1e-15, normalize=True):
-    """Log loss, aka logistic loss or cross-entropy loss.
-
-    This is the loss function used in (multinomial) logistic regression
-    and extensions of it such as neural networks, defined as the negative
-    log-likelihood of the true labels given a probabilistic classifier's
-    predictions. For a single sample with true label yt in {0,1} and
-    estimated probability yp that yt = 1, the log loss is
-
-        -log P(yt|yp) = -(yt log(yp) + (1 - yt) log(1 - yp))
-
-    Parameters
-    ----------
-    y_true : array-like or label indicator matrix
-        Ground truth (correct) labels for n_samples samples.
-
-    y_pred : array-like of float, shape = (n_samples, n_classes)
-        Predicted probabilities, as returned by a classifier's
-        predict_proba method.
-
-    eps : float
-        Log loss is undefined for p=0 or p=1, so probabilities are
-        clipped to max(eps, min(1 - eps, p)).
-
-    normalize : bool, optional (default=True)
-        If true, return the mean loss per sample.
-        Otherwise, return the sum of the per-sample losses.
-
-    Returns
-    -------
-    loss : float
-
-    Examples
-    --------
-    >>> log_loss(["spam", "ham", "ham", "spam"],  # doctest: +ELLIPSIS
-    ...          [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
-    0.21616...
-
-    References
-    ----------
-    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
-    p. 209.
-
-    Notes
-    -----
-    The logarithm used is the natural logarithm (base-e).
-    """
-    lb = LabelBinarizer()
-    T = lb.fit_transform(y_true)
-    if T.shape[1] == 1:
-        T = np.append(1 - T, T, axis=1)
-
-    # Clipping
-    Y = np.clip(y_pred, eps, 1 - eps)
-
-    # This happens in cases when elements in y_pred have type "str".
-    if not isinstance(Y, np.ndarray):
-        raise ValueError("y_pred should be an array of floats.")
-
-    # If y_pred is of single dimension, assume y_true to be binary
-    # and then check.
-    if Y.ndim == 1:
-        Y = Y[:, np.newaxis]
-    if Y.shape[1] == 1:
-        Y = np.append(1 - Y, Y, axis=1)
-
-    # Check if dimensions are consistent.
-    check_consistent_length(T, Y)
-    T = check_array(T)
-    Y = check_array(Y)
-    if T.shape[1] != Y.shape[1]:
-        raise ValueError("y_true and y_pred have different number of classes "
-                         "%d, %d" % (T.shape[1], Y.shape[1]))
-
-    # Renormalize
-    Y /= Y.sum(axis=1)[:, np.newaxis]
-    loss = -(T * np.log(Y)).sum()
-    return loss / T.shape[0] if normalize else loss
-
-
 def label_ranking_average_precision_score(y_true, y_score):
     """Compute ranking-based average precision
 
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
diff --git a/sklearn/metrics/tests/test_ranking.py b/sklearn/metrics/tests/test_ranking.py