DOC Improve OPTICS doc (scikit-learn#13866)

qinhanmin2014 · jnothman · commit b7b4d3e2f1a6 · 2019-05-15T17:34:22.000+10:00
diff --git a/sklearn/cluster/optics_.py b/sklearn/cluster/optics_.py
@@ -7,6 +7,7 @@
 Authors: Shane Grigsby <refuge@rocktalus.com>
          Adrin Jalali <adrinjalali@gmail.com>
          Erich Schubert <erich@debian.org>
+         Hanmin Qin <qinhanmin2005@sina.com>
 License: BSD 3 clause
 """
 
@@ -23,13 +24,15 @@
 class OPTICS(BaseEstimator, ClusterMixin):
     """Estimate clustering structure from vector array
 
-    OPTICS: Ordering Points To Identify the Clustering Structure Closely
+    OPTICS (Ordering Points To Identify the Clustering Structure), closely
     related to DBSCAN, finds core sample of high density and expands clusters
     from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
     neighborhood radius. Better suited for usage on large datasets than the
     current sklearn implementation of DBSCAN.
 
-    Clusters are then extracted using a DBSCAN like method [1]_.
+    Clusters are then extracted using a DBSCAN-like method
+    (cluster_method = 'dbscan') or an automatic
+    technique proposed in [1]_ (cluster_method = 'xi').
 
     This implementation deviates from the original OPTICS by first performing
     k-nearest-neighborhood searches on all points to identify core sizes, then
@@ -49,22 +52,21 @@ class OPTICS(BaseEstimator, ClusterMixin):
         2).
 
     max_eps : float, optional (default=np.inf)
-        The maximum distance between two samples for them to be considered
-        as in the same neighborhood. Default value of ``np.inf`` will identify
-        clusters across all scales; reducing ``max_eps`` will result in
-        shorter run times.
+        The maximum distance between two samples for one to be considered as
+        in the neighborhood of the other. Default value of ``np.inf`` will
+        identify clusters across all scales; reducing ``max_eps`` will result
+        in shorter run times.
 
     metric : string or callable, optional (default='minkowski')
-        metric to use for distance computation. Any metric from scikit-learn
+        Metric to use for distance computation. Any metric from scikit-learn
         or scipy.spatial.distance can be used.
 
         If metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded. The callable
         should take two arrays as input and return one value indicating the
         distance between them. This works for Scipy's metrics, but is less
-        efficient than passing the metric name as a string.
-
-        Distance matrices are not supported.
+        efficient than passing the metric name as a string. If metric is
+        "precomputed", X is assumed to be a distance matrix and must be square.
 
         Valid values for metric are:
 
@@ -94,9 +96,9 @@ class OPTICS(BaseEstimator, ClusterMixin):
         reachability and ordering. Possible values are "xi" and "dbscan".
 
     eps : float, optional (default=None)
-        The maximum distance between two samples for them to be considered
-        as in the same neighborhood. By default it assumes the same value as
-        ``max_eps``.
+        The maximum distance between two samples for one to be considered as
+        in the neighborhood of the other. By default it assumes the same value
+        as ``max_eps``.
         Used only when ``cluster_method='dbscan'``.
 
     xi : float, between 0 and 1, optional (default=0.05)
@@ -219,8 +221,10 @@ def fit(self, X, y=None):
 
         Parameters
         ----------
-        X : array, shape (n_samples, n_features)
-            The data.
+        X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
+if metric=’precomputed’.
+            A feature array, or array of distances between samples if
+            metric='precomputed'.
 
         y : ignored
 
@@ -332,31 +336,32 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
 
     Parameters
     ----------
-    X : array, shape (n_samples, n_features)
-        The data.
+    X : array, shape (n_samples, n_features), or (n_samples, n_samples)  \
+if metric=’precomputed’.
+        A feature array, or array of distances between samples if
+        metric='precomputed'
 
     min_samples : int (default=5)
         The number of samples in a neighborhood for a point to be considered
         as a core point. Expressed as an absolute number or a fraction of the
         number of samples (rounded to be at least 2).
 
     max_eps : float, optional (default=np.inf)
-        The maximum distance between two samples for them to be considered
-        as in the same neighborhood. Default value of "np.inf" will identify
-        clusters across all scales; reducing `max_eps` will result in
-        shorter run times.
+        The maximum distance between two samples for one to be considered as
+        in the neighborhood of the other. Default value of ``np.inf`` will
+        identify clusters across all scales; reducing ``max_eps`` will result
+        in shorter run times.
 
     metric : string or callable, optional (default='minkowski')
-        metric to use for distance computation. Any metric from scikit-learn
+        Metric to use for distance computation. Any metric from scikit-learn
         or scipy.spatial.distance can be used.
 
         If metric is a callable function, it is called on each
         pair of instances (rows) and the resulting value recorded. The callable
         should take two arrays as input and return one value indicating the
         distance between them. This works for Scipy's metrics, but is less
-        efficient than passing the metric name as a string.
-
-        Distance matrices are not supported.
+        efficient than passing the metric name as a string. If metric is
+        "precomputed", X is assumed to be a distance matrix and must be square.
 
         Valid values for metric are:
 
@@ -771,8 +776,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
         clusters.
     """
 
-    # all indices are inclusive (specially at the end)
-    # add an inf to the end of reachability plot
+    # Our implementation adds an inf to the end of reachability plot
     # this helps to find potential clusters at the end of the
     # reachability plot even if there's no upward region at the end of it.
     reachability_plot = np.hstack((reachability_plot, np.inf))
@@ -783,6 +787,10 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
     index = 0
     mib = 0.  # maximum in between, section 4.3.2
 
+    # Our implementation corrects a mistake in the original
+    # paper, i.e., in Definition 9 steep downward point,
+    # r(p) * (1 - x1) <= r(p + 1) should be
+    # r(p) * (1 - x1) >= r(p + 1)
     with np.errstate(invalid='ignore'):
         ratio = reachability_plot[:-1] / reachability_plot[1:]
         steep_upward = ratio <= xi_complement