Merge pull request scikit-learn#3772 from MechCoder/manhattan_metric

agramfort · agramfort · commit 30619ff44879 · 2014-10-21T09:50:15.000+02:00
[MRG+2] ENH: Patches Nearest Centroid for metric=manhattan for sparse and dense data
diff --git a/doc/whats_new.rst b/doc/whats_new.rst
@@ -119,6 +119,10 @@ Bug fixes
       estimator. It allows for instance to make bagging of a pipeline object.
       By `Arnaud Joly`_
 
+    - :class:`neighbors.NearestCentroid` now uses the median as the centroid
+      when metric is set to ``manhattan``. It was using the mean before.
+      By `Manoj Kumar`_
+
 API changes summary
 -------------------
 
diff --git a/sklearn/neighbors/nearest_centroid.py b/sklearn/neighbors/nearest_centroid.py
@@ -8,6 +8,7 @@
 #
 # License: BSD 3 clause
 
+import warnings
 import numpy as np
 from scipy import sparse as sp
 
@@ -16,6 +17,7 @@
 from ..metrics.pairwise import pairwise_distances
 from ..preprocessing import LabelEncoder
 from ..utils.validation import check_array, check_X_y
+from ..utils.sparsefuncs import csc_median_axis_0
 
 
 class NearestCentroid(BaseEstimator, ClassifierMixin):
@@ -31,6 +33,12 @@ class NearestCentroid(BaseEstimator, ClassifierMixin):
         feature array. If metric is a string or callable, it must be one of
         the options allowed by metrics.pairwise.pairwise_distances for its
         metric parameter.
+        The centroid for the samples corresponding to each class is the
+        point from which the sum of the distances (according to the metric)
+        of all samples that belong to that particular class is minimized.
+        If the "manhattan" metric is provided, this centroid is the median;
+        for all other metrics, the centroid is set to be the mean.
+
     shrink_threshold : float, optional (default = None)
         Threshold for shrinking centroids to remove features.
 
@@ -86,8 +94,14 @@ def fit(self, X, y):
         y : array, shape = [n_samples]
             Target values (integers)
         """
-        X, y = check_X_y(X, y, ['csr', 'csc'])
-        if sp.issparse(X) and self.shrink_threshold:
+        # If X is sparse and the metric is "manhattan", store it in csc
+        # format, since that makes it easier to calculate the median.
+        if self.metric == 'manhattan':
+            X, y = check_X_y(X, y, ['csc'])
+        else:
+            X, y = check_X_y(X, y, ['csr', 'csc'])
+        is_X_sparse = sp.issparse(X)
+        if is_X_sparse and self.shrink_threshold:
             raise ValueError("threshold shrinking not supported"
                              " for sparse input")
 
@@ -107,9 +121,23 @@ def fit(self, X, y):
         for cur_class in y_ind:
             center_mask = y_ind == cur_class
             nk[cur_class] = np.sum(center_mask)
-            if sp.issparse(X):
+            if is_X_sparse:
                 center_mask = np.where(center_mask)[0]
-            self.centroids_[cur_class] = X[center_mask].mean(axis=0)
+
+            # XXX: Update other averaging methods according to the metrics.
+            if self.metric == "manhattan":
+                # NumPy does not calculate median of sparse matrices.
+                if not is_X_sparse:
+                    self.centroids_[cur_class] = np.median(X[center_mask], axis=0)
+                else:
+                    self.centroids_[cur_class] = csc_median_axis_0(X[center_mask])
+            else:
+                if self.metric != 'euclidean':
+                    warnings.warn("Averaging for metrics other than "
+                                  "euclidean and manhattan not supported. "
+                                  "The average is set to be the mean."
+                                  )
+                self.centroids_[cur_class] = X[center_mask].mean(axis=0)
 
         if self.shrink_threshold:
             dataset_centroid_ = np.mean(X, axis=0)
diff --git a/sklearn/neighbors/tests/test_nearest_centroid.py b/sklearn/neighbors/tests/test_nearest_centroid.py
@@ -125,6 +125,17 @@ def test_predict_translated_data():
     assert_array_equal(y_init, y_translate)
 
 
+def test_manhattan_metric():
+    """Test the manhattan metric."""
+
+    clf = NearestCentroid(metric='manhattan')
+    clf.fit(X, y)
+    dense_centroid = clf.centroids_
+    clf.fit(X_csr, y)
+    assert_array_equal(clf.centroids_, dense_centroid)
+    assert_array_equal(dense_centroid, [[-1, -1], [1, 1]])
+
+
 if __name__ == "__main__":
     import nose
     nose.runmodule()
diff --git a/sklearn/preprocessing/imputation.py b/sklearn/preprocessing/imputation.py
@@ -12,6 +12,7 @@
 from ..utils import check_array
 from ..utils import as_float_array
 from ..utils.fixes import astype
+from ..utils.sparsefuncs import _get_median
 
 from ..externals import six
 
@@ -31,34 +32,6 @@ def _get_mask(X, value_to_mask):
         return X == value_to_mask
 
 
-def _get_median(data, n_zeros):
-    """Compute the median of data with n_zeros additional zeros.
-
-    This function is used to support sparse matrices; it modifies data in-place
-    """
-    n_elems = len(data) + n_zeros
-    if not n_elems:
-        return np.nan
-    n_negative = np.count_nonzero(data < 0)
-    middle, is_odd = divmod(n_elems, 2)
-    data.sort()
-
-    if is_odd:
-        return _get_elem_at_rank(middle, data, n_negative, n_zeros)
-
-    return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) +
-            _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.
-
-
-def _get_elem_at_rank(rank, data, n_negative, n_zeros):
-    """Find the value in data augmented with n_zeros for the given rank"""
-    if rank < n_negative:
-        return data[rank]
-    if rank - n_negative < n_zeros:
-        return 0
-    return data[rank - n_zeros]
-
-
 def _most_frequent(array, extra_value, n_repeat):
     """Compute the most frequent value in a 1d array extended with
        [extra_value] * n_repeat, where extra_value is assumed to be not part
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
@@ -342,3 +342,63 @@ def count_nonzero(X, axis=None, sample_weight=None):
                                weights=weights)
     else:
         raise ValueError('Unsupported axis: {0}'.format(axis))
+
+
+def _get_median(data, n_zeros):
+    """Compute the median of data with n_zeros additional zeros.
+
+    This function is used to support sparse matrices; it modifies data in-place
+    """
+    n_elems = len(data) + n_zeros
+    if not n_elems:
+        return np.nan
+    n_negative = np.count_nonzero(data < 0)
+    middle, is_odd = divmod(n_elems, 2)
+    data.sort()
+
+    if is_odd:
+        return _get_elem_at_rank(middle, data, n_negative, n_zeros)
+
+    return (_get_elem_at_rank(middle - 1, data, n_negative, n_zeros) +
+            _get_elem_at_rank(middle, data, n_negative, n_zeros)) / 2.
+
+
+def _get_elem_at_rank(rank, data, n_negative, n_zeros):
+    """Find the value in data augmented with n_zeros for the given rank"""
+    if rank < n_negative:
+        return data[rank]
+    if rank - n_negative < n_zeros:
+        return 0
+    return data[rank - n_zeros]
+
+
+def csc_median_axis_0(X):
+    """Find the median across axis 0 of a CSC matrix.
+    It is equivalent to doing np.median(X, axis=0).
+
+    Parameters
+    ----------
+    X : CSC sparse matrix, shape (n_samples, n_features)
+        Input data.
+
+    Returns
+    -------
+    median : ndarray, shape (n_features,)
+        Median of each column (feature) of X.
+
+    """
+    if not isinstance(X, sp.csc_matrix):
+        raise TypeError("Expected matrix of CSC format, got %s" % X.format)
+
+    indptr = X.indptr
+    n_samples, n_features = X.shape
+    median = np.zeros(n_features)
+
+    for f_ind, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
+
+        # Prevent modifying X in place
+        data = np.copy(X.data[start: end])
+        nz = n_samples - data.size
+        median[f_ind] = _get_median(data, nz)
+
+    return median
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
@@ -10,7 +10,7 @@
                                        inplace_row_scale,
                                        inplace_swap_row, inplace_swap_column,
                                        min_max_axis,
-                                       count_nonzero)
+                                       count_nonzero, csc_median_axis_0)
 from sklearn.utils.sparsefuncs_fast import assign_rows_csr
 from sklearn.utils.testing import assert_raises
 
@@ -359,3 +359,36 @@ def test_count_nonzero():
 
     assert_raises(TypeError, count_nonzero, X_csc)
     assert_raises(ValueError, count_nonzero, X_csr, axis=2)
+
+
+def test_csc_row_median():
+    """Test csc_median_axis_0 actually calculates the median."""
+
+    # Test that it gives the same output when X is dense.
+    rng = np.random.RandomState(0)
+    X = rng.rand(100, 50)
+    dense_median = np.median(X, axis=0)
+    csc = sp.csc_matrix(X)
+    sparse_median = csc_median_axis_0(csc)
+    assert_array_equal(sparse_median, dense_median)
+
+    # Test that it gives the same output when X is sparse
+    X = rng.rand(51, 100)
+    X[X < 0.7] = 0.0
+    ind = rng.randint(0, 50, 10)
+    X[ind] = -X[ind]
+    csc = sp.csc_matrix(X)
+    dense_median = np.median(X, axis=0)
+    sparse_median = csc_median_axis_0(csc)
+    assert_array_equal(sparse_median, dense_median)
+
+    # Test for toy data.
+    X = [[0, -2], [-1, -1], [1, 0], [2, 1]]
+    csc = sp.csc_matrix(X)
+    assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5]))
+    X = [[0, -2], [-1, -5], [1, -3]]
+    csc = sp.csc_matrix(X)
+    assert_array_equal(csc_median_axis_0(csc), np.array([0., -3]))
+
+    # Test that it raises an Error for non-csc matrices.
+    assert_raises(TypeError, csc_median_axis_0, sp.csr_matrix(X))