77Authors: Shane Grigsby <[email protected] > 8899 Erich Schubert <[email protected] > 10+ 1011License: BSD 3 clause
1112"""
1213
2324class OPTICS (BaseEstimator , ClusterMixin ):
2425 """Estimate clustering structure from vector array
2526
26- OPTICS: Ordering Points To Identify the Clustering Structure Closely
27+ OPTICS ( Ordering Points To Identify the Clustering Structure), closely
2728 related to DBSCAN, finds core sample of high density and expands clusters
2829 from them [1]_. Unlike DBSCAN, keeps cluster hierarchy for a variable
2930 neighborhood radius. Better suited for usage on large datasets than the
3031 current sklearn implementation of DBSCAN.
3132
32- Clusters are then extracted using a DBSCAN like method [1]_.
33+ Clusters are then extracted using a DBSCAN-like method
34+ (cluster_method = 'dbscan') or an automatic
35+ technique proposed in [1]_ (cluster_method = 'xi').
3336
3437 This implementation deviates from the original OPTICS by first performing
3538 k-nearest-neighborhood searches on all points to identify core sizes, then
@@ -49,22 +52,21 @@ class OPTICS(BaseEstimator, ClusterMixin):
4952 2).
5053
5154 max_eps : float, optional (default=np.inf)
52- The maximum distance between two samples for them to be considered
53- as in the same neighborhood. Default value of ``np.inf`` will identify
54- clusters across all scales; reducing ``max_eps`` will result in
55- shorter run times.
55+ The maximum distance between two samples for one to be considered as
56+ in the neighborhood of the other . Default value of ``np.inf`` will
57+ identify clusters across all scales; reducing ``max_eps`` will result
58+ in shorter run times.
5659
5760 metric : string or callable, optional (default='minkowski')
58- metric to use for distance computation. Any metric from scikit-learn
61+ Metric to use for distance computation. Any metric from scikit-learn
5962 or scipy.spatial.distance can be used.
6063
6164 If metric is a callable function, it is called on each
6265 pair of instances (rows) and the resulting value recorded. The callable
6366 should take two arrays as input and return one value indicating the
6467 distance between them. This works for Scipy's metrics, but is less
65- efficient than passing the metric name as a string.
66-
67- Distance matrices are not supported.
68+ efficient than passing the metric name as a string. If metric is
69+ "precomputed", X is assumed to be a distance matrix and must be square.
6870
6971 Valid values for metric are:
7072
@@ -94,9 +96,9 @@ class OPTICS(BaseEstimator, ClusterMixin):
9496 reachability and ordering. Possible values are "xi" and "dbscan".
9597
9698 eps : float, optional (default=None)
97- The maximum distance between two samples for them to be considered
98- as in the same neighborhood. By default it assumes the same value as
99- ``max_eps``.
99+ The maximum distance between two samples for one to be considered as
100+ in the neighborhood of the other . By default it assumes the same value
101+ as ``max_eps``.
100102 Used only when ``cluster_method='dbscan'``.
101103
102104 xi : float, between 0 and 1, optional (default=0.05)
@@ -219,8 +221,10 @@ def fit(self, X, y=None):
219221
220222 Parameters
221223 ----------
222- X : array, shape (n_samples, n_features)
223- The data.
224+ X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
225+ if metric=’precomputed’.
226+ A feature array, or array of distances between samples if
227+ metric='precomputed'.
224228
225229 y : ignored
226230
@@ -332,31 +336,32 @@ def compute_optics_graph(X, min_samples, max_eps, metric, p, metric_params,
332336
333337 Parameters
334338 ----------
335- X : array, shape (n_samples, n_features)
336- The data.
339+ X : array, shape (n_samples, n_features), or (n_samples, n_samples) \
340+ if metric=’precomputed’.
341+ A feature array, or array of distances between samples if
342+ metric='precomputed'
337343
338344 min_samples : int (default=5)
339345 The number of samples in a neighborhood for a point to be considered
340346 as a core point. Expressed as an absolute number or a fraction of the
341347 number of samples (rounded to be at least 2).
342348
343349 max_eps : float, optional (default=np.inf)
344- The maximum distance between two samples for them to be considered
345- as in the same neighborhood. Default value of " np.inf" will identify
346- clusters across all scales; reducing `max_eps` will result in
347- shorter run times.
350+ The maximum distance between two samples for one to be considered as
351+ in the neighborhood of the other . Default value of `` np.inf`` will
352+ identify clusters across all scales; reducing `` max_eps`` will result
353+ in shorter run times.
348354
349355 metric : string or callable, optional (default='minkowski')
350- metric to use for distance computation. Any metric from scikit-learn
356+ Metric to use for distance computation. Any metric from scikit-learn
351357 or scipy.spatial.distance can be used.
352358
353359 If metric is a callable function, it is called on each
354360 pair of instances (rows) and the resulting value recorded. The callable
355361 should take two arrays as input and return one value indicating the
356362 distance between them. This works for Scipy's metrics, but is less
357- efficient than passing the metric name as a string.
358-
359- Distance matrices are not supported.
363+ efficient than passing the metric name as a string. If metric is
364+ "precomputed", X is assumed to be a distance matrix and must be square.
360365
361366 Valid values for metric are:
362367
@@ -771,8 +776,7 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
771776 clusters.
772777 """
773778
774- # all indices are inclusive (specially at the end)
775- # add an inf to the end of reachability plot
779+ # Our implementation adds an inf to the end of reachability plot
776780 # this helps to find potential clusters at the end of the
777781 # reachability plot even if there's no upward region at the end of it.
778782 reachability_plot = np .hstack ((reachability_plot , np .inf ))
@@ -783,6 +787,10 @@ def _xi_cluster(reachability_plot, predecessor_plot, ordering, xi, min_samples,
783787 index = 0
784788 mib = 0. # maximum in between, section 4.3.2
785789
790+ # Our implementation corrects a mistake in the original
791+ # paper, i.e., in Definition 9 steep downward point,
792+ # r(p) * (1 - x1) <= r(p + 1) should be
793+ # r(p) * (1 - x1) >= r(p + 1)
786794 with np .errstate (invalid = 'ignore' ):
787795 ratio = reachability_plot [:- 1 ] / reachability_plot [1 :]
788796 steep_upward = ratio <= xi_complement
0 commit comments