[MRG+1] avoid integer overflow by using floats for matthews_corrcoef (scikit-learn#9693)

sam-s · amueller · commit 4c61e8b2237a · 2017-09-11T16:06:01.000-04:00
* Fix bug#9622: avoid integer overflow by using floats for matthews_corrcoef

* matthews_corrcoef: cosmetic change requested by jnothman

* Add test_matthews_corrcoef_overflow for Bug#9622

* test_matthews_corrcoef_overflow: clean-up and make deterministic

* matthews_corrcoef: pass dtype=np.float64 to sum & trace instead of using astype

* test_matthews_corrcoef_overflow: add simple deterministic tests
diff --git a/sklearn/metrics/classification.py b/sklearn/metrics/classification.py
@@ -167,7 +167,7 @@ def accuracy_score(y_true, y_pred, normalize=True, sample_weight=None):
     2
 
     In the multilabel case with binary label indicators:
-    
+
     >>> accuracy_score(np.array([[0, 1], [1, 1]]), np.ones((2, 2)))
     0.5
     """
@@ -528,9 +528,9 @@ def matthews_corrcoef(y_true, y_pred, sample_weight=None):
     y_pred = lb.transform(y_pred)
 
     C = confusion_matrix(y_true, y_pred, sample_weight=sample_weight)
-    t_sum = C.sum(axis=1)
-    p_sum = C.sum(axis=0)
-    n_correct = np.trace(C)
+    t_sum = C.sum(axis=1, dtype=np.float64)
+    p_sum = C.sum(axis=0, dtype=np.float64)
+    n_correct = np.trace(C, dtype=np.float64)
     n_samples = p_sum.sum()
     cov_ytyp = n_correct * n_samples - np.dot(t_sum, p_sum)
     cov_ypyp = n_samples ** 2 - np.dot(p_sum, p_sum)
diff --git a/sklearn/metrics/tests/test_classification.py b/sklearn/metrics/tests/test_classification.py
@@ -483,6 +483,47 @@ def test_matthews_corrcoef_multiclass():
     assert_almost_equal(mcc, 0.)
 
 
+def test_matthews_corrcoef_overflow():
+    # Non-regression test: matthews_corrcoef must not overflow when the
+    # confusion matrix holds large counts.
+    # https://github.com/scikit-learn/scikit-learn/issues/9622
+    rng = np.random.RandomState(20170906)
+
+    def mcc_safe(y_true, y_pred):
+        # Reference binary MCC built from counts divided by n_points
+        # (assumes true division is in effect for this module).
+        conf_matrix = confusion_matrix(y_true, y_pred)
+        # conf_matrix[i, j]: samples of true class i predicted as class j.
+        true_pos = conf_matrix[1, 1]
+        false_pos = conf_matrix[0, 1]
+        false_neg = conf_matrix[1, 0]
+        n_points = len(y_true)
+        pos_rate = (true_pos + false_neg) / n_points
+        activity = (true_pos + false_pos) / n_points
+        mcc_numerator = true_pos / n_points - pos_rate * activity
+        mcc_denominator = activity * pos_rate * (1 - activity) * (1 - pos_rate)
+        return mcc_numerator / np.sqrt(mcc_denominator)
+
+    def random_ys(n_points):    # binary
+        # y_pred thresholds x_true perturbed by uniform noise in [-0.1, 0.1).
+        x_true = rng.random_sample(n_points)
+        x_pred = x_true + 0.2 * (rng.random_sample(n_points) - 0.5)
+        y_true = (x_true > 0.5)
+        y_pred = (x_pred > 0.5)
+        return y_true, y_pred
+
+    for n_points in [100, 10000, 1000000]:
+        arr = np.repeat([0., 1.], n_points)  # binary
+        assert_almost_equal(matthews_corrcoef(arr, arr), 1.0)
+        arr = np.repeat([0., 1., 2.], n_points)  # multiclass
+        assert_almost_equal(matthews_corrcoef(arr, arr), 1.0)
+
+        y_true, y_pred = random_ys(n_points)
+        assert_almost_equal(matthews_corrcoef(y_true, y_true), 1.0)
+        assert_almost_equal(matthews_corrcoef(y_true, y_pred),
+                            mcc_safe(y_true, y_pred))
+
+
 def test_precision_recall_f1_score_multiclass():
     # Test Precision Recall and F1 Score for multiclass classification task
     y_true, y_pred, _ = make_prediction(binary=False)