# -*- coding: utf-8 -*-
"""
HDBSCAN: Hierarchical Density-Based Spatial Clustering
         of Applications with Noise
"""
# Author: Leland McInnes <leland.mcinnes@gmail.com>
#         Steve Astels <sastels@gmail.com>
#         John Healy <jchealy@gmail.com>
#
# License: BSD 3 clause

import numpy as np

from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.metrics import pairwise_distances
from scipy.sparse import issparse
from sklearn.neighbors import KDTree, BallTree
from joblib import Memory
import six
from warnings import warn
from sklearn.utils import check_array
from joblib.parallel import cpu_count

from scipy.sparse import csgraph

from ._hdbscan_linkage import (single_linkage,
                               mst_linkage_core,
                               mst_linkage_core_vector,
                               label)
from ._hdbscan_tree import (condense_tree,
                            compute_stability,
                            get_clusters,
                            outlier_scores)
from ._hdbscan_reachability import (mutual_reachability,
                                    sparse_mutual_reachability)

from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm
from .dist_metrics import DistanceMetric

from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree
from .prediction import PredictionData

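# Metrics in FAST_METRICS are routed to the tree-based (Prim's or Boruvka)
# code paths below; any other metric falls back to the generic
# pairwise-distance-matrix implementation.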
FAST_METRICS = (KDTree.valid_metrics + BallTree.valid_metrics +
                ['cosine', 'arccos'])


def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10,
                    cluster_selection_method='eom',
                    allow_single_cluster=False,
                    match_reference_implementation=False,
                    cluster_selection_epsilon=0.0):
    """Convert a single linkage tree and minimum cluster size into a
    set of cluster labels and membership probabilities.
    """
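    # The pipeline below: condense the tree by pruning splits smaller than
    # min_cluster_size, compute the persistence (stability) of the remaining
    # clusters, then select final clusters by the chosen selection method.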
    condensed_tree = condense_tree(single_linkage_tree,
                                   min_cluster_size)
    stability_dict = compute_stability(condensed_tree)
    labels, probabilities, stabilities = get_clusters(condensed_tree,
                                                      stability_dict,
                                                      cluster_selection_method,
                                                      allow_single_cluster,
                                                      match_reference_implementation,
                                                      cluster_selection_epsilon)

    return (labels, probabilities, stabilities, condensed_tree,
            single_linkage_tree)


def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2,
                     leaf_size=None, gen_min_span_tree=False, **kwargs):
    if metric == 'minkowski':
        distance_matrix = pairwise_distances(X, metric=metric, p=p)
    elif metric == 'arccos':
        distance_matrix = pairwise_distances(X, metric='cosine', **kwargs)
    elif metric == 'precomputed':
        # Treating this case explicitly, instead of letting
        #   sklearn.metrics.pairwise_distances handle it,
        #   enables the usage of numpy.inf in the distance
        #   matrix to indicate missing distance information.
        # TODO: Check if copying is necessary
        distance_matrix = X.copy()
    else:
        distance_matrix = pairwise_distances(X, metric=metric, **kwargs)

    if issparse(distance_matrix):
        return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples,
                                               alpha, metric, p,
                                               leaf_size, gen_min_span_tree,
                                               **kwargs)

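    # The mutual reachability distance between points a and b is
    # max(core_k(a), core_k(b), d(a, b)), where core_k is the distance to a
    # point's min_samples-th nearest neighbour; this density-aware metric is
    # what the minimum spanning tree is built over.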
    mutual_reachability_ = mutual_reachability(distance_matrix,
                                               min_samples, alpha)

    min_spanning_tree = mst_linkage_core(mutual_reachability_)

    # Warn if the MST couldn't be constructed around the missing distances
    if np.isinf(min_spanning_tree.T[2]).any():
        warn('The minimum spanning tree contains edge weights with value '
             'infinity. Potentially, you are missing too many distances '
             'in the initial distance matrix for the given neighborhood '
             'size.', UserWarning)

    # mst_linkage_core does not generate a full minimal spanning tree
    # If a tree is required then we must build the edges from the information
    # returned by mst_linkage_core (i.e. just the order of points to be merged)
    if gen_min_span_tree:
        result_min_span_tree = min_spanning_tree.copy()
        for index, row in enumerate(result_min_span_tree[1:], 1):
            candidates = np.where(np.isclose(mutual_reachability_[int(row[1])],
                                             row[2]))[0]
            candidates = np.intersect1d(candidates,
                                        min_spanning_tree[:index, :2].astype(int))
            candidates = candidates[candidates != row[1]]
            assert len(candidates) > 0
            row[0] = candidates[0]
    else:
        result_min_span_tree = None

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    return single_linkage_tree, result_min_span_tree


def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0,
                                    metric='minkowski', p=2, leaf_size=40,
                                    gen_min_span_tree=False, **kwargs):
    assert issparse(X)
    # Check for connected component on X
    if csgraph.connected_components(X, directed=False, return_labels=False) > 1:
        raise ValueError('Sparse distance matrix has multiple connected '
                         'components!\nThat is, there exist groups of points '
                         'that are completely disjoint -- there are no distance '
                         'relations connecting them\n'
                         'Run hdbscan on each component.')

    lil_matrix = X.tolil()

    # Compute sparse mutual reachability graph
    # if max_dist > 0, max distance to use when the reachability is infinite
    max_dist = kwargs.get("max_dist", 0.)
    mutual_reachability_ = sparse_mutual_reachability(lil_matrix,
                                                      min_points=min_samples,
                                                      max_dist=max_dist,
                                                      alpha=alpha)
    # Check connected component on mutual reachability.
    # If there is more than one component, then even if the distance matrix X
    # has a single component, some points have fewer than `min_samples`
    # neighbors.
    if csgraph.connected_components(mutual_reachability_, directed=False,
                                    return_labels=False) > 1:
        raise ValueError(('There exist points with fewer than %s neighbors. '
                          'Ensure your distance matrix has nonzero values for '
                          'at least `min_samples`=%s neighbors for each point '
                          '(i.e. a K-nn graph), or specify a `max_dist` to '
                          'use when distances are missing.')
                         % (min_samples, min_samples))

    # Compute the minimum spanning tree for the sparse graph
    sparse_min_spanning_tree = csgraph.minimum_spanning_tree(
        mutual_reachability_)

    # Convert the graph to scipy cluster array format
    nonzeros = sparse_min_spanning_tree.nonzero()
    nonzero_vals = sparse_min_spanning_tree[nonzeros]
    min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]),
                                          :][0]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0,
                          metric='minkowski', p=2, leaf_size=40,
                          gen_min_span_tree=False, **kwargs):
    if X.dtype != np.float64:
        X = X.astype(np.float64)

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    # TODO: Deal with p for minkowski appropriately
    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')
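    # Note that tree.query counts each point as its own nearest neighbour at
    # distance zero, so the final column above is the distance to the
    # (min_samples - 1)-th other point -- the core distance used here.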
    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
                                                alpha)

    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]

    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        warn('Cannot generate Minimum Spanning Tree; '
             'the implemented Prim\'s does not produce '
             'the full minimum spanning tree', UserWarning)

    return single_linkage_tree, None


def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40,
                            gen_min_span_tree=False, **kwargs):
    if X.dtype != np.float64:
        X = X.astype(np.float64)

    # The Cython routines used require contiguous arrays
    if not X.flags['C_CONTIGUOUS']:
        X = np.array(X, dtype=np.double, order='C')

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)

    dist_metric = DistanceMetric.get_metric(metric, **kwargs)

    # Get distance to kth nearest neighbour
    core_distances = tree.query(X, k=min_samples,
                                dualtree=True,
                                breadth_first=True)[0][:, -1].copy(order='C')

    # Mutual reachability distance is implicit in mst_linkage_core_vector
    min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric,
                                                alpha)
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        warn('Cannot generate Minimum Spanning Tree; '
             'the implemented Prim\'s does not produce '
             'the full minimum spanning tree', UserWarning)

    return single_linkage_tree, None


def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0,
                            metric='minkowski', p=2, leaf_size=40,
                            approx_min_span_tree=True,
                            gen_min_span_tree=False,
                            core_dist_n_jobs=4, **kwargs):
    if leaf_size < 3:
        leaf_size = 3

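    # Negative values follow the joblib n_jobs convention: -1 means use all
    # available CPUs, -2 all but one, and so on.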
    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
                                 leaf_size=leaf_size // 3,
                                 approx_min_span_tree=approx_min_span_tree,
                                 n_jobs=core_dist_n_jobs, **kwargs)
    min_spanning_tree = alg.spanning_tree()
    # Sort edges of the min_spanning_tree by weight
    row_order = np.argsort(min_spanning_tree.T[2])
    min_spanning_tree = min_spanning_tree[row_order, :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
                              metric='minkowski', p=2, leaf_size=40,
                              approx_min_span_tree=True,
                              gen_min_span_tree=False,
                              core_dist_n_jobs=4, **kwargs):
    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
                                   leaf_size=leaf_size // 3,
                                   approx_min_span_tree=approx_min_span_tree,
                                   n_jobs=core_dist_n_jobs, **kwargs)
    min_spanning_tree = alg.spanning_tree()
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None


def check_precomputed_distance_matrix(X):
    """Perform check_array(X) after removing infinite values (numpy.inf)
    from the given distance matrix.
    """
    tmp = X.copy()
    tmp[np.isinf(tmp)] = 1
    check_array(tmp)


def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0,
            cluster_selection_epsilon=0.0,
            metric='minkowski', p=2, leaf_size=40,
            algorithm='best', memory=Memory(cachedir=None, verbose=0),
            approx_min_span_tree=True, gen_min_span_tree=False,
            core_dist_n_jobs=4,
            cluster_selection_method='eom', allow_single_cluster=False,
            match_reference_implementation=False, **kwargs):
    """Perform HDBSCAN clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    min_cluster_size : int, optional (default=5)
        The minimum number of samples in a group for that group to be
        considered a cluster; groupings smaller than this size will be left
        as noise.

    min_samples : int, optional (default=None)
        The number of samples in a neighborhood for a point
        to be considered as a core point. This includes the point itself.
        Defaults to ``min_cluster_size``.

    cluster_selection_epsilon : float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [3]_ for more information.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [2]_ for more information.

    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster; setting this
        to True will override that behaviour and allow single cluster
        results in the case that you feel this is a valid result for your
        dataset.

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors' reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric.

    Returns
    -------
    labels : ndarray, shape (n_samples, )
        Cluster labels for each point. Noisy samples are given the label -1.

    probabilities : ndarray, shape (n_samples, )
        Cluster membership strengths for each point. Noisy samples are
        assigned 0.

    cluster_persistence : array, shape (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree : record array
        The condensed cluster hierarchy used to generate clusters.

    single_linkage_tree : ndarray, shape (n_samples - 1, 4)
        The single linkage tree produced during clustering in scipy
        hierarchical clustering format
        (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html).

    min_spanning_tree : ndarray, shape (n_samples - 1, 3)
        The minimum spanning tree as an edgelist. If gen_min_span_tree was
        False this will be None.

    References
    ----------

    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
       Density-based Cluster Selection. arXiv preprint arXiv:1911.02282.
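
    Examples
    --------
    A minimal usage sketch; this assumes the function is importable from
    the top-level ``hdbscan`` package, and ``make_blobs`` merely supplies
    example data (labels found will depend on your data):

    >>> from sklearn.datasets import make_blobs
    >>> data, _ = make_blobs(n_samples=200, random_state=10)
    >>> (labels, probabilities, persistence,
    ...  condensed_tree, single_linkage_tree, mst) = hdbscan(
    ...     data, min_cluster_size=10)
    >>> labels.shape
    (200,)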
    """
    if min_samples is None:
        min_samples = min_cluster_size

    if type(min_samples) is not int or type(min_cluster_size) is not int:
        raise ValueError('Min samples and min cluster size must be integers!')

    if min_samples <= 0 or min_cluster_size <= 0:
        raise ValueError('Min samples and Min cluster size must be positive'
                         ' integers')

    if min_cluster_size == 1:
        raise ValueError('Min cluster size must be greater than one')

    if type(cluster_selection_epsilon) is int:
        cluster_selection_epsilon = float(cluster_selection_epsilon)

    if type(cluster_selection_epsilon) is not float or \
            cluster_selection_epsilon < 0.0:
        raise ValueError('Epsilon must be a float value greater than or '
                         'equal to 0!')

    if not isinstance(alpha, float) or alpha <= 0.0:
        raise ValueError('Alpha must be a positive float value greater than'
                         ' 0!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')

    if match_reference_implementation:
        min_samples = min_samples - 1
        min_cluster_size = min_cluster_size + 1
        approx_min_span_tree = False

    if cluster_selection_method not in ('eom', 'leaf'):
        raise ValueError('Invalid Cluster Selection Method: %s\n'
                         'Should be one of: "eom", "leaf"\n'
                         % cluster_selection_method)

    # Checks input and converts to an nd-array where possible
    if metric != 'precomputed' or issparse(X):
        X = check_array(X, accept_sparse='csr')
    else:
        # Only non-sparse, precomputed distance matrices are handled here
        #   and thereby allowed to contain numpy.inf for missing distances
        check_precomputed_distance_matrix(X)

    # Python 2 and 3 compliant string_type checking
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)

    size = X.shape[0]
    min_samples = min(size - 1, min_samples)
    if min_samples == 0:
        min_samples = 1
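    # min_samples is capped at size - 1 above because a point can have at
    # most size - 1 neighbours; the floor of 1 keeps core distances defined
    # for degenerate, very small inputs.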

    if algorithm != 'best':
        if metric != 'precomputed' and issparse(X) and algorithm != 'generic':
            raise ValueError("Sparse data matrices only support algorithm 'generic'.")

        if algorithm == 'generic':
            (single_linkage_tree,
             result_min_span_tree) = memory.cache(
                _hdbscan_generic)(X, min_samples, alpha, metric,
                                  p, leaf_size, gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Prim's with KDTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_prims_kdtree)(X, min_samples, alpha,
                                       metric, p, leaf_size,
                                       gen_min_span_tree, **kwargs)
        elif algorithm == 'prims_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Prim's with BallTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_prims_balltree)(X, min_samples, alpha,
                                         metric, p, leaf_size,
                                         gen_min_span_tree, **kwargs)
        elif algorithm == 'boruvka_kdtree':
            if metric not in KDTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with KDTree for this"
                                 " metric!")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                         metric, p, leaf_size,
                                         approx_min_span_tree,
                                         gen_min_span_tree,
                                         core_dist_n_jobs, **kwargs)
        elif algorithm == 'boruvka_balltree':
            if metric not in BallTree.valid_metrics:
                raise ValueError("Cannot use Boruvka with BallTree for this"
                                 " metric!")
            if (X.shape[0] // leaf_size) > 16000:
                warn("A large dataset size and small leaf_size may induce excessive "
                     "memory usage. If you are running out of memory consider "
                     "increasing the ``leaf_size`` parameter.")
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                           metric, p, leaf_size,
                                           approx_min_span_tree,
                                           gen_min_span_tree,
                                           core_dist_n_jobs, **kwargs)
        else:
            raise TypeError('Unknown algorithm type %s specified' % algorithm)
    else:
        if issparse(X) or metric not in FAST_METRICS:
            # We can't do much with sparse matrices ...
            (single_linkage_tree, result_min_span_tree) = memory.cache(
                _hdbscan_generic)(X, min_samples,
                                  alpha, metric, p, leaf_size,
                                  gen_min_span_tree, **kwargs)
        elif metric in KDTree.valid_metrics:
            # TODO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_prims_kdtree)(X, min_samples, alpha,
                                           metric, p, leaf_size,
                                           gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_boruvka_kdtree)(X, min_samples, alpha,
                                             metric, p, leaf_size,
                                             approx_min_span_tree,
                                             gen_min_span_tree,
                                             core_dist_n_jobs, **kwargs)
        else:  # Metric is a valid BallTree metric
            # TODO: Need heuristic to decide when to go to boruvka;
            # still debugging for now
            if X.shape[1] > 60:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_prims_balltree)(X, min_samples, alpha,
                                             metric, p, leaf_size,
                                             gen_min_span_tree, **kwargs)
            else:
                (single_linkage_tree, result_min_span_tree) = memory.cache(
                    _hdbscan_boruvka_balltree)(X, min_samples, alpha,
                                               metric, p, leaf_size,
                                               approx_min_span_tree,
                                               gen_min_span_tree,
                                               core_dist_n_jobs, **kwargs)

    return _tree_to_labels(X,
                           single_linkage_tree,
                           min_cluster_size,
                           cluster_selection_method,
                           allow_single_cluster,
                           match_reference_implementation,
                           cluster_selection_epsilon) + \
        (result_min_span_tree,)


class HDBSCAN(BaseEstimator, ClusterMixin):
    """Perform HDBSCAN clustering from vector array or distance matrix.

    HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications
    with Noise. Performs DBSCAN over varying epsilon values and integrates
    the result to find a clustering that gives the best stability over epsilon.
    This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN),
    and be more robust to parameter selection.

    Parameters
    ----------
    min_cluster_size : int, optional (default=5)
        The minimum size of clusters; single linkage splits that contain
        fewer points than this will be considered points "falling out" of a
        cluster rather than a cluster splitting into two new clusters.

    min_samples : int, optional (default=None)
        The number of samples in a neighbourhood for a point to be
        considered a core point.

    metric : string, or callable, optional (default='euclidean')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=None)
        p value to use if using the minkowski metric.

    alpha : float, optional (default=1.0)
        A distance scaling parameter as used in robust single linkage.
        See [3]_ for more information.

    cluster_selection_epsilon : float, optional (default=0.0)
        A distance threshold. Clusters below this value will be merged.
        See [5]_ for more information.

    algorithm : string, optional (default='best')
        Exactly which algorithm to use; hdbscan has variants specialised
        for different characteristics of the data. By default this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``generic``
            * ``prims_kdtree``
            * ``prims_balltree``
            * ``boruvka_kdtree``
            * ``boruvka_balltree``

    leaf_size : int, optional (default=40)
        If using a space tree algorithm (kdtree, or balltree) the number
        of points in a leaf node of the tree. This does not alter the
        resulting clustering, but may have an effect on the runtime
        of the algorithm.

    memory : instance of joblib.Memory or string, optional
        Used to cache the output of the computation of the tree.
        By default, no caching is done. If a string is given, it is the
        path to the caching directory.

    approx_min_span_tree : bool, optional (default=True)
        Whether to accept an only approximate minimum spanning tree.
        For some algorithms this can provide a significant speedup, but
        the resulting clustering may be of marginally lower quality.
        If you are willing to sacrifice speed for correctness you may want
        to explore this; in general this should be left at the default True.

    gen_min_span_tree : bool, optional (default=False)
        Whether to generate the minimum spanning tree with regard
        to mutual reachability distance for later analysis.

    core_dist_n_jobs : int, optional (default=4)
        Number of parallel jobs to run in core distance computations (if
        supported by the specific algorithm). For ``core_dist_n_jobs``
        below -1, (n_cpus + 1 + core_dist_n_jobs) are used.

    cluster_selection_method : string, optional (default='eom')
        The method used to select clusters from the condensed tree. The
        standard approach for HDBSCAN* is to use an Excess of Mass algorithm
        to find the most persistent clusters. Alternatively you can instead
        select the clusters at the leaves of the tree -- this provides the
        most fine grained and homogeneous clusters. Options are:
            * ``eom``
            * ``leaf``

    allow_single_cluster : bool, optional (default=False)
        By default HDBSCAN* will not produce a single cluster; setting this
        to True will override that behaviour and allow single cluster
        results in the case that you feel this is a valid result for your
        dataset.

    prediction_data : boolean, optional (default=False)
        Whether to generate extra cached data for predicting labels or
        membership vectors for new unseen points later. If you wish to
        persist the clustering object for later re-use you probably want
        to set this to True.

    match_reference_implementation : bool, optional (default=False)
        There exist some interpretational differences between this
        HDBSCAN* implementation and the original authors' reference
        implementation in Java. This can result in very minor differences
        in clustering results. Setting this flag to True will, at some
        performance cost, ensure that the clustering results match the
        reference implementation.

    **kwargs : optional
        Arguments passed to the distance metric.

    Attributes
    ----------
    labels_ : ndarray, shape (n_samples, )
        Cluster labels for each point in the dataset given to fit().
        Noisy samples are given the label -1.

    probabilities_ : ndarray, shape (n_samples, )
        The strength with which each sample is a member of its assigned
        cluster. Noise points have probability zero; points in clusters
        have values assigned proportional to the degree that they
        persist as part of the cluster.

    cluster_persistence_ : ndarray, shape (n_clusters, )
        A score of how persistent each cluster is. A score of 1.0 represents
        a perfectly stable cluster that persists over all distance scales,
        while a score of 0.0 represents a perfectly ephemeral cluster. These
        scores can be used to gauge the relative coherence of the clusters
        output by the algorithm.

    condensed_tree_ : CondensedTree object
        The condensed tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    single_linkage_tree_ : SingleLinkageTree object
        The single linkage tree produced by HDBSCAN. The object has methods
        for converting to pandas, networkx, and plotting.

    minimum_spanning_tree_ : MinimumSpanningTree object
        The minimum spanning tree of the mutual reachability graph generated
        by HDBSCAN. Note that this is not generated by default and will only
        be available if `gen_min_span_tree` was set to True on object creation.
        Even then in some optimized cases a tree may not be generated.

    outlier_scores_ : ndarray, shape (n_samples, )
        Outlier scores for clustered points; the larger the score the more
        outlier-like the point. Useful as an outlier detection technique.
        Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander.

    prediction_data_ : PredictionData object
        Cached data used for predicting the cluster labels of new or
        unseen points. Necessary only if you are using functions from
        ``hdbscan.prediction`` (see
        :func:`~hdbscan.prediction.approximate_predict`,
        :func:`~hdbscan.prediction.membership_vector`,
        and :func:`~hdbscan.prediction.all_points_membership_vectors`).

    exemplars_ : list
        A list of exemplar points for clusters. Since HDBSCAN supports
        arbitrary shapes for clusters we cannot provide a single cluster
        exemplar per cluster. Instead a list is returned with each element
        of the list being a numpy array of exemplar points for a cluster --
        these points are the "most representative" points of the cluster.

    relative_validity_ : float
        A fast approximation of the Density Based Cluster Validity (DBCV)
        score [4]_. The only difference, and the source of the speed, is
        that this relative_validity_ is computed using the mutual-
        reachability minimum spanning tree, i.e. minimum_spanning_tree_,
        instead of the all-points minimum spanning tree used in the
        reference. This score might not be an objective measure of the
        goodness of clustering. It may only be used to compare results
        across different choices of hyper-parameters, and is therefore
        only a relative score.

    References
    ----------

    .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April).
       Density-based clustering based on hierarchical density estimates.
       In Pacific-Asia Conference on Knowledge Discovery and Data Mining
       (pp. 160-172). Springer Berlin Heidelberg.

    .. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015).
       Hierarchical density estimates for data clustering, visualization,
       and outlier detection. ACM Transactions on Knowledge Discovery
       from Data (TKDD), 10(1), 5.

    .. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the
       cluster tree. In Advances in Neural Information Processing Systems
       (pp. 343-351).

    .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and
       Sander, J., 2014. Density-Based Clustering Validation. In SDM
       (pp. 839-847).

    .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical
       Density-based Cluster Selection. arXiv preprint arXiv:1911.02282.

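    Examples
    --------
    A minimal usage sketch; this assumes the class is importable from the
    top-level ``hdbscan`` package, and ``make_blobs`` merely supplies
    example data:

    >>> from sklearn.datasets import make_blobs
    >>> data, _ = make_blobs(n_samples=200, random_state=10)
    >>> clusterer = HDBSCAN(min_cluster_size=10).fit(data)
    >>> clusterer.labels_.shape
    (200,)
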
    """

    def __init__(self, min_cluster_size=5, min_samples=None,
                 cluster_selection_epsilon=0.0,
                 metric='euclidean', alpha=1.0, p=None,
                 algorithm='best', leaf_size=40,
                 memory=Memory(cachedir=None, verbose=0),
                 approx_min_span_tree=True,
                 gen_min_span_tree=False,
                 core_dist_n_jobs=4,
                 cluster_selection_method='eom',
                 allow_single_cluster=False,
                 prediction_data=False,
                 match_reference_implementation=False, **kwargs):
        self.min_cluster_size = min_cluster_size
        self.min_samples = min_samples
        self.alpha = alpha
        self.cluster_selection_epsilon = cluster_selection_epsilon
        self.metric = metric
        self.p = p
        self.algorithm = algorithm
        self.leaf_size = leaf_size
        self.memory = memory
        self.approx_min_span_tree = approx_min_span_tree
        self.gen_min_span_tree = gen_min_span_tree
        self.core_dist_n_jobs = core_dist_n_jobs
        self.cluster_selection_method = cluster_selection_method
        self.allow_single_cluster = allow_single_cluster
        self.match_reference_implementation = match_reference_implementation
        self.prediction_data = prediction_data

        self._metric_kwargs = kwargs

        self._condensed_tree = None
        self._single_linkage_tree = None
        self._min_spanning_tree = None
        self._raw_data = None
        self._outlier_scores = None
        self._prediction_data = None
        self._relative_validity = None

    def fit(self, X, y=None):
        """Perform HDBSCAN clustering from features or distance matrix.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        self : object
            Returns self.
        """
        if self.metric != 'precomputed':
            X = check_array(X, accept_sparse='csr')
            self._raw_data = X
        elif issparse(X):
            # Handle sparse precomputed distance matrices separately
            X = check_array(X, accept_sparse='csr')
        else:
            # Only non-sparse, precomputed distance matrices are allowed
            #   to have numpy.inf values indicating missing distances
            check_precomputed_distance_matrix(X)

        kwargs = self.get_params()
        # prediction data only applies to the persistent model, so remove
        # it from the keyword args we pass on to the function
        kwargs.pop('prediction_data', None)
        kwargs.update(self._metric_kwargs)

        (self.labels_,
         self.probabilities_,
         self.cluster_persistence_,
         self._condensed_tree,
         self._single_linkage_tree,
         self._min_spanning_tree) = hdbscan(X, **kwargs)

        if self.prediction_data:
            self.generate_prediction_data()

        return self

    def fit_predict(self, X, y=None):
        """Performs clustering on X and returns cluster labels.

        Parameters
        ----------
        X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \
                array of shape (n_samples, n_samples)
            A feature array, or array of distances between samples if
            ``metric='precomputed'``.

        Returns
        -------
        y : ndarray, shape (n_samples, )
            cluster labels
        """
        self.fit(X)
        return self.labels_

    def generate_prediction_data(self):
        """
        Create data that caches intermediate results used for predicting
        the label of new/unseen points. This data is only useful if
        you are intending to use functions from ``hdbscan.prediction``.
        """

        if self.metric in FAST_METRICS:
            min_samples = self.min_samples or self.min_cluster_size
            if self.metric in KDTree.valid_metrics:
                tree_type = 'kdtree'
            elif self.metric in BallTree.valid_metrics:
                tree_type = 'balltree'
            else:
                warn('Metric {} not supported for prediction data!'.format(
                    self.metric))
                return

            self._prediction_data = PredictionData(
                self._raw_data, self.condensed_tree_, min_samples,
                tree_type=tree_type, metric=self.metric,
                **self._metric_kwargs
            )
        else:
            warn('Cannot generate prediction data for non-vector '
                 'space inputs -- access to the source data rather '
                 'than mere distances is required!')


    def weighted_cluster_centroid(self, cluster_id):
        """Provide an approximate representative point for a given cluster.
        Note that this technique assumes a euclidean metric for speed of
        computation. For more general metrics use the
        ``weighted_cluster_medoid`` method, which is slower but can work
        with the metric the model trained with.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a centroid for.

        Returns
        -------
        centroid: array of shape (n_features,)
            A representative centroid for cluster ``cluster_id``.
        """
        if not hasattr(self, 'labels_'):
            raise AttributeError('Model has not been fit to data')

        if cluster_id == -1:
            raise ValueError('Cannot calculate weighted centroid for -1 '
                             'cluster since it is a noise cluster')

        mask = self.labels_ == cluster_id
        cluster_data = self._raw_data[mask]
        cluster_membership_strengths = self.probabilities_[mask]

        return np.average(cluster_data,
                          weights=cluster_membership_strengths,
                          axis=0)


    def weighted_cluster_medoid(self, cluster_id):
        """Provide an approximate representative point for a given cluster.
        Note that this technique can be very slow and memory intensive for
        large clusters. For faster results use the
        ``weighted_cluster_centroid`` method, which assumes a euclidean
        metric.

        Parameters
        ----------
        cluster_id: int
            The id of the cluster to compute a medoid for.

        Returns
        -------
        medoid: array of shape (n_features,)
            A representative medoid for cluster ``cluster_id``.
        """
        if not hasattr(self, 'labels_'):
            raise AttributeError('Model has not been fit to data')

        if cluster_id == -1:
            raise ValueError('Cannot calculate weighted medoid for -1 '
                             'cluster since it is a noise cluster')

        mask = self.labels_ == cluster_id
        cluster_data = self._raw_data[mask]
        cluster_membership_strengths = self.probabilities_[mask]

        dist_mat = pairwise_distances(cluster_data,
                                      metric=self.metric,
                                      **self._metric_kwargs)

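        # Broadcasting scales each column of the distance matrix by that
        # point's membership strength, so distances to low-confidence points
        # contribute less to the row sums minimised below.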
        dist_mat = dist_mat * cluster_membership_strengths
        medoid_index = np.argmin(dist_mat.sum(axis=1))
        return cluster_data[medoid_index]


    @property
    def prediction_data_(self):
        if self._prediction_data is None:
            raise AttributeError('No prediction data was generated')
        else:
            return self._prediction_data

    @property
    def outlier_scores_(self):
        if self._outlier_scores is not None:
            return self._outlier_scores
        else:
            if self._condensed_tree is not None:
                self._outlier_scores = outlier_scores(self._condensed_tree)
                return self._outlier_scores
            else:
                raise AttributeError('No condensed tree was generated; '
                                     'try running fit first.')

    @property
    def condensed_tree_(self):
        if self._condensed_tree is not None:
            return CondensedTree(self._condensed_tree,
                                 self.cluster_selection_method,
                                 self.allow_single_cluster)
        else:
            raise AttributeError('No condensed tree was generated; '
                                 'try running fit first.')

    @property
    def single_linkage_tree_(self):
        if self._single_linkage_tree is not None:
            return SingleLinkageTree(self._single_linkage_tree)
        else:
            raise AttributeError('No single linkage tree was generated; '
                                 'try running fit first.')

    @property
    def minimum_spanning_tree_(self):
        if self._min_spanning_tree is not None:
            if self._raw_data is not None:
                return MinimumSpanningTree(self._min_spanning_tree,
                                           self._raw_data)
            else:
                warn('No raw data is available; this may be due to using'
                     ' a precomputed metric matrix. No minimum spanning'
                     ' tree will be provided without raw data.')
                return None
        else:
            raise AttributeError('No minimum spanning tree was generated. '
                                 'This may be due to optimized algorithm '
                                 'variations that skip explicit generation '
                                 'of the spanning tree.')

    @property
    def exemplars_(self):
        if self._prediction_data is not None:
            return self._prediction_data.exemplars
        elif self.metric in FAST_METRICS:
            self.generate_prediction_data()
            return self._prediction_data.exemplars
        else:
            raise AttributeError('Currently exemplars require the use of '
                                 'vector input data with a suitable metric. '
                                 'This will likely change in the future, but '
                                 'for now no exemplars can be provided.')


    @property
    def relative_validity_(self):
        if self._relative_validity is not None:
            return self._relative_validity

        if not self.gen_min_span_tree:
            raise AttributeError("Minimum spanning tree not present. "
                                 "Either HDBSCAN object was created with "
                                 "gen_min_span_tree=False or the tree was "
                                 "not generated in spite of it owing to "
                                 "internal optimization criteria.")

        labels = self.labels_
        sizes = np.bincount(labels + 1)
        noise_size = sizes[0]
        cluster_size = sizes[1:]
        total = noise_size + np.sum(cluster_size)
        num_clusters = len(cluster_size)
        DSC = np.zeros(num_clusters)
        min_outlier_sep = np.inf  # only required if num_clusters = 1
        correction_const = 2  # only required if num_clusters = 1

        # Ultimately, for each Ci, we only require the
        # minimum of DSPC(Ci, Cj) over all Cj != Ci.
        # So let's call this value DSPC_wrt(Ci), i.e.
        # density separation 'with respect to' Ci.
        DSPC_wrt = np.ones(num_clusters) * np.inf
        max_distance = 0

        mst_df = self.minimum_spanning_tree_.to_pandas()

        for edge in mst_df.iterrows():
            label1 = labels[int(edge[1]['from'])]
            label2 = labels[int(edge[1]['to'])]
            length = edge[1]['distance']

            max_distance = max(max_distance, length)

            if label1 == -1 and label2 == -1:
                continue
            elif label1 == -1 or label2 == -1:
                # If exactly one of the points is noise
                min_outlier_sep = min(min_outlier_sep, length)
                continue

            if label1 == label2:
                # Set the density sparseness of the cluster
                # to the sparsest value seen so far.
                DSC[label1] = max(length, DSC[label1])
            else:
                # Check whether density separations with
                # respect to each of these clusters can
                # be reduced.
                DSPC_wrt[label1] = min(length, DSPC_wrt[label1])
                DSPC_wrt[label2] = min(length, DSPC_wrt[label2])

        # In case min_outlier_sep is still np.inf, we assign a new value to it.
        # This only makes sense if num_clusters = 1 since it has turned out
        # that the MR-MST has no edges between a noise point and a core point.
        min_outlier_sep = max_distance if min_outlier_sep == np.inf \
            else min_outlier_sep

        # DSPC_wrt[Ci] might be infinite if the connected component for Ci is
        # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the
        # MR-MST might contain an edge with one point in Cj and the other one
        # in Ck. Here, we replace the infinite density separation of Ci by
        # another large enough value.
        #
        # TODO: Think of a better yet efficient way to handle this.
        correction = correction_const * (
            max_distance if num_clusters > 1 else min_outlier_sep)
        DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction

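        # Per-cluster validity: the gap between a cluster's separation from
        # the other clusters (DSPC_wrt) and its internal sparseness (DSC),
        # normalised to [-1, 1]; the final score weights each cluster's
        # validity by its share of the total points.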
        V_index = [(DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i])
                   for i in range(num_clusters)]
        score = np.sum([(cluster_size[i] * V_index[i]) / total
                        for i in range(num_clusters)])
        self._relative_validity = score
        return self._relative_validity