Remove add_row_csr (scikit-learn#6676)

MechCoder · MechCoder · commit 20f89ef7a37d · 2016-05-24T14:16:49.000-07:00
diff --git a/sklearn/cluster/_k_means.pyx b/sklearn/cluster/_k_means.pyx
@@ -15,7 +15,7 @@ cimport numpy as np
 cimport cython
 
 from ..utils.extmath import norm
-from sklearn.utils.sparsefuncs_fast cimport add_row_csr
+from sklearn.utils.sparsefuncs_fast import assign_rows_csr
 from sklearn.utils.fixes import bincount
 
 ctypedef np.float64_t DOUBLE
@@ -326,9 +326,8 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
     centers: array, shape (n_clusters, n_features)
         The resulting centers
     """
-    n_features = X.shape[1]
-
-    cdef np.npy_intp cluster_id
+    cdef int n_features = X.shape[1]
+    cdef int curr_label
 
     cdef np.ndarray[DOUBLE, ndim=1] data = X.data
     cdef np.ndarray[int, ndim=1] indices = X.indices
@@ -341,24 +340,25 @@ def _centers_sparse(X, np.ndarray[INT, ndim=1] labels, n_clusters,
         bincount(labels, minlength=n_clusters)
     cdef np.ndarray[np.npy_intp, ndim=1, mode="c"] empty_clusters = \
         np.where(n_samples_in_cluster == 0)[0]
+    cdef int n_empty_clusters = empty_clusters.shape[0]
 
     # maybe also relocate small clusters?
 
-    if empty_clusters.shape[0] > 0:
+    if n_empty_clusters > 0:
         # find points to reassign empty clusters to
-        far_from_centers = distances.argsort()[::-1]
+        far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
 
-        for i in range(empty_clusters.shape[0]):
-            cluster_id = empty_clusters[i]
+        # XXX two relocated clusters could be close to each other
+        assign_rows_csr(X, far_from_centers, empty_clusters, centers)
 
-            # XXX two relocated clusters could be close to each other
-            centers[cluster_id] = 0.
-            add_row_csr(data, indices, indptr, far_from_centers[i],
-                        centers[cluster_id])
-            n_samples_in_cluster[cluster_id] = 1
+        for i in range(n_empty_clusters):
+            n_samples_in_cluster[empty_clusters[i]] = 1
 
     for i in range(labels.shape[0]):
-        add_row_csr(data, indices, indptr, i, centers[labels[i]])
+        curr_label = labels[i]
+        for ind in range(indptr[i], indptr[i + 1]):
+            j = indices[ind]
+            centers[curr_label, j] += data[ind]
 
     centers /= n_samples_in_cluster[:, np.newaxis]
 
diff --git a/sklearn/utils/sparsefuncs_fast.pxd b/sklearn/utils/sparsefuncs_fast.pxd
diff --git a/sklearn/utils/sparsefuncs_fast.pyx b/sklearn/utils/sparsefuncs_fast.pyx
@@ -382,21 +382,6 @@ def _inplace_csr_row_normalize_l2(np.ndarray[floating, ndim=1] X_data,
             X_data[j] /= sum_
 
 
-cdef void add_row_csr(np.ndarray[np.float64_t, ndim=1] data,
-                      np.ndarray[int, ndim=1] indices,
-                      np.ndarray[int, ndim=1] indptr,
-                      int i, np.ndarray[np.float64_t, ndim=1, mode="c"] out):
-    """Add row i of CSR matrix (data, indices, indptr) to array out.
-
-    Equivalent to out += X[i].toarray(). Returns None.
-    """
-    cdef int ind, j
-
-    for ind in range(indptr[i], indptr[i + 1]):
-        j = indices[ind]
-        out[j] += data[ind]
-
-
 def assign_rows_csr(X,
                     np.ndarray[np.npy_intp, ndim=1] X_rows,
                     np.ndarray[np.npy_intp, ndim=1] out_rows,
@@ -427,8 +412,6 @@ def assign_rows_csr(X,
 
     out[out_rows] = 0.
     for i in range(X_rows.shape[0]):
-        # XXX we could reuse add_row_csr here, but the array slice
-        # is not optimized away.
         rX = X_rows[i]
         for ind in range(indptr[rX], indptr[rX + 1]):
             j = indices[ind]