Commit 0180188

ENH multiclass probability estimates for SGDClassifier

Fixes scikit-learn#1814.

1 parent 5eb035c

File tree

5 files changed: +114 −28 lines changed

doc/whats_new.rst

Lines changed: 3 additions & 0 deletions

@@ -51,6 +51,9 @@ Changelog
   converts their ``coef_`` into a sparse matrix, meaning stored models
   trained using these estimators can be made much more compact.
 
+- :class:`linear_model.SGDClassifier` now produces multiclass probability
+  estimates when trained under log loss or modified Huber loss.
+
 - Hyperlinks to documentation in example code on the website by
   `Martin Luessi`_.
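A minimal usage sketch of the behavior this changelog entry announces. The toy data is invented for illustration, and `n_iter` reflects the SGDClassifier API of this era:

    import numpy as np
    from sklearn.linear_model import SGDClassifier

    X = np.array([[0., 0.], [1., 1.], [2., 2.],
                  [0., 1.], [2., 1.], [1., 0.]])
    y = np.array([0, 1, 2, 0, 2, 1])  # three classes

    clf = SGDClassifier(loss="log", alpha=0.01, n_iter=10).fit(X, y)

    proba = clf.predict_proba([[1.5, 1.5]])  # no longer raises for >2 classes
    print(proba.shape)        # (1, 3): one row of class probabilities per sample
    print(proba.sum(axis=1))  # each row sums to 1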

sklearn/linear_model/base.py

Lines changed: 19 additions & 0 deletions

@@ -222,6 +222,25 @@ def predict(self, X):
         indices = scores.argmax(axis=1)
         return self.classes_[indices]
 
+    def _predict_proba_lr(self, X):
+        """Probability estimation for OvR logistic regression.
+
+        Positive class probabilities are computed as
+        1. / (1. + np.exp(-self.decision_function(X)));
+        multiclass is handled by normalizing that over all classes.
+        """
+        prob = self.decision_function(X)
+        prob *= -1
+        np.exp(prob, prob)
+        prob += 1
+        np.reciprocal(prob, prob)
+        if len(prob.shape) == 1:
+            return np.vstack([1 - prob, prob]).T
+        else:
+            # OvR normalization, like LibLinear's predict_probability
+            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
+            return prob
+
 
 class SparseCoefMixin(object):
     """Mixin for converting coef_ to and from CSR format.

sklearn/linear_model/logistic.py

Lines changed: 1 addition & 12 deletions

@@ -120,18 +120,7 @@ def predict_proba(self, X):
         Returns the probability of the sample for each class in the model,
         where classes are ordered as they are in ``self.classes_``.
         """
-        # 1. / (1. + np.exp(-scores)), computed in-place
-        prob = self.decision_function(X)
-        prob *= -1
-        np.exp(prob, prob)
-        prob += 1
-        np.reciprocal(prob, prob)
-        if len(prob.shape) == 1:
-            return np.vstack([1 - prob, prob]).T
-        else:
-            # OvR, not softmax, like Liblinear's predict_probability
-            prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))
-            return prob
+        return self._predict_proba_lr(X)
 
     def predict_log_proba(self, X):
         """Log of probability estimates.

sklearn/linear_model/stochastic_gradient.py

Lines changed: 43 additions & 13 deletions

@@ -654,7 +654,12 @@ class SGDClassifier(BaseSGDClassifier, SelectorMixin):
     def predict_proba(self, X):
         """Probability estimates.
 
-        Probability estimates are only supported for binary classification.
+        Multiclass probability estimates are derived from binary (one-vs.-rest)
+        estimates by simple normalization, as recommended by Zadrozny and
+        Elkan.
+
+        Binary probability estimates for loss="modified_huber" are given by
+        (clip(decision_function(X), -1, 1) + 1) / 2.
 
         Parameters
         ----------
@@ -668,34 +673,59 @@ def predict_proba(self, X):
 
         References
         ----------
+        Zadrozny and Elkan, "Transforming classifier scores into multiclass
+        probability estimates", SIGKDD'02,
+        http://www.research.ibm.com/people/z/zadrozny/kdd2002-Transf.pdf
 
         The justification for the formula in the loss="modified_huber"
         case is in appendix B of:
         http://jmlr.csail.mit.edu/papers/volume2/zhang02c/zhang02c.pdf
         """
-        if len(self.classes_) != 2:
-            raise NotImplementedError("predict_(log_)proba only supported"
-                                      " for binary classification")
-
-        scores = self.decision_function(X)
-        proba = np.ones((scores.shape[0], 2), dtype=np.float64)
         if self.loss == "log":
-            proba[:, 1] = 1. / (1. + np.exp(-scores))
+            return self._predict_proba_lr(X)
 
         elif self.loss == "modified_huber":
-            proba[:, 1] = (np.clip(scores, -1, 1) + 1) / 2.
+            binary = (len(self.classes_) == 2)
+            scores = self.decision_function(X)
+
+            if binary:
+                prob2 = np.ones((scores.shape[0], 2))
+                prob = prob2[:, 1]
+            else:
+                prob = scores
+
+            np.clip(scores, -1, 1, prob)
+            prob += 1.
+            prob /= 2.
+
+            if binary:
+                prob2[:, 0] -= prob
+                prob = prob2
+            else:
+                # the above might assign zero to all classes, which doesn't
+                # normalize neatly; work around this to produce uniform
+                # probabilities
+                prob_sum = prob.sum(axis=1)
+                all_zero = (prob_sum == 0)
+                if np.any(all_zero):
+                    prob[all_zero, :] = 1
+                    prob_sum[all_zero] = len(self.classes_)
+
+                # normalize
+                prob /= prob_sum.reshape((prob.shape[0], -1))
+
+            return prob
 
         else:
             raise NotImplementedError("predict_(log_)proba only supported when"
                                       " loss='log' or loss='modified_huber' "
-                                      "(%s given)" % self.loss)
-        proba[:, 0] -= proba[:, 1]
-        return proba
+                                      "(%r given)" % self.loss)
 
     def predict_log_proba(self, X):
         """Log of probability estimates.
 
-        Log probability estimates are only supported for binary classification.
+        When loss="modified_huber", probability estimates may be hard zeros
+        and ones, so taking the logarithm is not possible.
 
         Parameters
         ----------
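Outside the estimator, the modified-Huber branch above reduces to a few lines of NumPy. A standalone sketch of the multiclass case, including the uniform fallback for rows where every score clips to -1 (score values invented; three classes):

    import numpy as np

    scores = np.array([[0.4, -0.2, -1.6],    # ordinary row
                       [-1.5, -2.0, -3.1]])  # all scores < -1: every class clips to 0

    prob = (np.clip(scores, -1, 1) + 1) / 2.  # per-class binary estimates in [0, 1]

    prob_sum = prob.sum(axis=1)
    all_zero = (prob_sum == 0)
    prob[all_zero, :] = 1                     # fall back to uniform probabilities
    prob_sum[all_zero] = scores.shape[1]
    prob /= prob_sum[:, np.newaxis]           # Zadrozny & Elkan OvR normalization

    # second row comes out as [1/3, 1/3, 1/3]; the first row sums to 1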

sklearn/linear_model/tests/test_sgd.py

Lines changed: 48 additions & 3 deletions

@@ -319,9 +319,11 @@ def test_sgd_proba(self):
         clf = self.factory(loss="hinge", alpha=0.01, n_iter=10).fit(X, Y)
         assert_raises(NotImplementedError, clf.predict_proba, [3, 2])
 
-        # the log and modified_huber losses can output "probability" estimates
-        for loss in ("log", "modified_huber"):
-            clf = self.factory(loss=loss, alpha=0.01, n_iter=10).fit(X, Y)
+        # log and modified_huber losses can output probability estimates
+        # binary case
+        for loss in ["log", "modified_huber"]:
+            clf = self.factory(loss=loss, alpha=0.01, n_iter=10)
+            clf.fit(X, Y)
             p = clf.predict_proba([3, 2])
             assert_true(p[0, 1] > 0.5)
             p = clf.predict_proba([-1, -1])
@@ -332,6 +334,49 @@ def test_sgd_proba(self):
             p = clf.predict_log_proba([-1, -1])
             assert_true(p[0, 1] < p[0, 0])
 
+        # log loss multiclass probability estimates
+        clf = self.factory(loss="log", alpha=0.01, n_iter=10).fit(X2, Y2)
+
+        d = clf.decision_function([[.1, -.1], [.3, .2]])
+        p = clf.predict_proba([[.1, -.1], [.3, .2]])
+        assert_array_equal(np.argmax(p, axis=1), np.argmax(d, axis=1))
+        assert_almost_equal(p[0].sum(), 1)
+        assert_true(np.all(p[0] >= 0))
+
+        p = clf.predict_proba([-1, -1])
+        d = clf.decision_function([-1, -1])
+        assert_array_equal(np.argsort(p[0]), np.argsort(d[0]))
+
+        l = clf.predict_log_proba([3, 2])
+        p = clf.predict_proba([3, 2])
+        assert_array_almost_equal(np.log(p), l)
+
+        l = clf.predict_log_proba([-1, -1])
+        p = clf.predict_proba([-1, -1])
+        assert_array_almost_equal(np.log(p), l)
+
+        # Modified Huber multiclass probability estimates; requires a separate
+        # test because the hard zero/one probabilities may destroy the
+        # ordering present in decision_function output.
+        clf = self.factory(loss="modified_huber", alpha=0.01, n_iter=10)
+        clf.fit(X2, Y2)
+        d = clf.decision_function([3, 2])
+        p = clf.predict_proba([3, 2])
+        if not isinstance(self, SparseSGDClassifierTestCase):
+            assert_equal(np.argmax(d, axis=1), np.argmax(p, axis=1))
+        else:  # XXX the sparse test gets a different X2 (?)
+            assert_equal(np.argmin(d, axis=1), np.argmin(p, axis=1))
+
+        # the following sample produces decision_function values < -1,
+        # which would cause naive normalization to fail (see comment
+        # in SGDClassifier.predict_proba)
+        x = X.mean(axis=0)
+        d = clf.decision_function(x)
+        if np.all(d < -1):  # XXX not true in sparse test case (why?)
+            p = clf.predict_proba(x)
+            assert_array_almost_equal(p[0], [1 / 3.] * 3)
+
+
     def test_sgd_l1(self):
         """Test L1 regularization"""
