1"""flat.py
2
3Provides alternative functions to hdbscan.HDBSCAN and others to
41. Allow prediction on a flat clustering by specifying 'n_clusters'.
5    This is done by choosing the best cluster_selection_epsilon that produces
6    the required number of clusters without adding unnecessary outliers.
72. Makes approximate_predict, membership_vector, and
8    all_points_membership_vectors consistent with cluster_selection_epsilon
9
10Provides the following functions:
11==================================
12HDBSCAN_flat: trained HDBSCAN instance with 'n_clusters' clusters
13    The attributes (labels, probabilities, prediction_data) are tuned to
14    produce 'n_clusters' clusters.
15
16approximate_predict_flat: labels and probabilities for novel points
17    Allows selecting n_clusters for novel points, or using the
18    original clustering (potentially specified using cluster_selection_epsilon)
19
20membership_vector_flat: Soft-clustering probabilities for novel points
21    Similar to approximate_predict_flat, but for soft-clustering.
22    **Use with caution**
23
24all_points_membership_vectors_flat: Soft-clustering probabilities
25    Similar to membership_vector_flat, but for points in training set
26    **Use with caution**
27"""
28
29import copy
30from warnings import warn
31
32import numpy as np
33from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves
34from .hdbscan_ import HDBSCAN, _tree_to_labels
35from .plots import _bfs_from_cluster_tree
36from .prediction import (PredictionData,
37                         _find_cluster_and_probability,
38                         _find_neighbor_and_lambda)
39from ._prediction_utils import (get_tree_row_with_child,
40                                dist_membership_vector,
41                                outlier_membership_vector,
42                                prob_in_some_cluster,
43                                all_points_dist_membership_vector,
44                                all_points_outlier_membership_vector,
45                                all_points_prob_in_some_cluster)
46
47
def HDBSCAN_flat(X, n_clusters=None,
                 cluster_selection_epsilon=0.,
                 clusterer=None, inplace=False, **kwargs):
    """
    Train an HDBSCAN clusterer by specifying n_clusters.
    Or, modify a trained clusterer to return a specific n_clusters.

    Parameters
    ----------
    X: array-like
        Data to be passed to HDBSCAN for training.

    n_clusters: int, default=None
        Number of clusters to produce.
        If None, revert to default HDBSCAN.

    cluster_selection_epsilon: float, default=0.
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    clusterer: HDBSCAN, default=None
        If supplied, modify this clusterer to produce n_clusters clusters.

    inplace: bool, default=False
        If the 'clusterer' parameter is supplied and inplace is True,
            modify the previous clusterer in place.
            If False, return a modified copy of the previous clusterer.

    **kwargs: keyword arguments
        All init arguments for HDBSCAN

    Returns
    -------
    new_clusterer: HDBSCAN
        New HDBSCAN instance; returned irrespective of inplace=True or False

    Usage
    -----
    # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters
    clusterer = HDBSCAN_flat(X_train, n_clusters=7,
                             min_cluster_size=12, min_samples=8)
    labels = clusterer.labels_
    proba = clusterer.probabilities_

    # Use a previously initialized/trained HDBSCAN
    old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8)
    clusterer = HDBSCAN_flat(X_train, n_clusters=7,
                             clusterer=old_clusterer, inplace=True)
    labels = clusterer.labels_
    proba = clusterer.probabilities_

    See Also
    ---------
    :py:func:`hdbscan.HDBSCAN`
    :py:func:`re_init`
    """
    # Handle the trivial case first.
    if (n_clusters is None) and (cluster_selection_epsilon == 0.):
        if (not isinstance(clusterer, HDBSCAN)) or (not inplace):
            # Always generate prediction_data to avoid later woes
            kwargs['prediction_data'] = True
            new_clusterer = HDBSCAN(**kwargs)
        else:
            new_clusterer = clusterer
            new_clusterer.prediction_data = True

        new_clusterer.fit(X)
        return new_clusterer

    if (n_clusters is not None) and (cluster_selection_epsilon != 0.):
        warn(f"'cluster_selection_epsilon' (={cluster_selection_epsilon})"
             " is ignored when 'n_clusters' is supplied.")
        cluster_selection_epsilon = 0.
        # This will later be chosen according to n_clusters

    if not isinstance(clusterer, HDBSCAN):
        # Initialize and train clusterer if one was not previously supplied.
        # Always generate prediction data
        kwargs['prediction_data'] = True
        new_clusterer = HDBSCAN(**kwargs)
        # We do not pass cluster_selection_epsilon here.
        # While this adds unnecessary computation, it makes the code
        #   easier to read and debug.
        new_clusterer.fit(X)
    else:
        if inplace:
            new_clusterer = clusterer
        else:
            new_clusterer = copy.deepcopy(clusterer)

        # Train on 'X'. Do this even if the supplied clusterer was trained,
        #   because we want to make sure it fits 'X'.
        new_clusterer.prediction_data = True
        new_clusterer.fit(X)

    if new_clusterer.cluster_selection_method == 'eom':
        max_eom_clusters = len(
                new_clusterer.condensed_tree_._select_clusters())

    # Pick an epsilon value right after a split produces n_clusters,
    #   and then don't split further for smaller epsilon (larger lambda)
    if n_clusters is not None:
        if ((new_clusterer.cluster_selection_method == 'eom') and
                (n_clusters > max_eom_clusters)):
            warn(f"Cannot predict more than {max_eom_clusters} clusters "
                 "with cluster selection method 'eom'. "
                 "Changing to method 'leaf'...")
            new_clusterer.cluster_selection_method = 'leaf'
        epsilon = select_epsilon(new_clusterer.condensed_tree_, n_clusters)
    else:
        # Or use the specified cluster_selection_epsilon
        epsilon = cluster_selection_epsilon

    new_clusterer.cluster_selection_epsilon = float(epsilon)

    # Extract tree related stuff, in order to re-assign labels
    single_linkage_tree = new_clusterer.single_linkage_tree_
    single_linkage_tree = single_linkage_tree.to_numpy()
    min_cluster_size = new_clusterer.min_cluster_size
    cluster_selection_method = new_clusterer.cluster_selection_method
    allow_single_cluster = new_clusterer.allow_single_cluster
    match_reference_implementation = False

    # Get labels according to the required cluster_selection_epsilon
    output = _tree_to_labels(
                        None,
                        single_linkage_tree, min_cluster_size,
                        cluster_selection_method,
                        allow_single_cluster,
                        match_reference_implementation,
                        cluster_selection_epsilon=epsilon)

    # Reflect the related changes in HDBSCAN.
    (new_clusterer.labels_,
     new_clusterer.probabilities_,
     new_clusterer.cluster_persistence_,
     new_clusterer._condensed_tree,
     new_clusterer._single_linkage_tree) = output

    # The PredictionData attached to HDBSCAN should also change.
    # A function re_init is defined in this module to handle this.
    re_init(new_clusterer.prediction_data_,
            new_clusterer.condensed_tree_,
            cluster_selection_epsilon=epsilon)
    return new_clusterer


def approximate_predict_flat(clusterer,
                             points_to_predict,
                             n_clusters=None,
                             cluster_selection_epsilon=None,
                             prediction_data=None,
                             return_prediction_data=False):
    """
    Predict the cluster label of new points at a particular flat clustering,
        specified by n_clusters. This is a modified version of
        hdbscan.approximate_predict to allow selection of n_clusters.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    n_clusters: int, default=None
        The number of clusters to have in the flat clustering
            (over the training data, not points_to_predict).
        Ignored when prediction_data is supplied.

    cluster_selection_epsilon: float, default=None
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    prediction_data: PredictionData, default=None
        If supplied, use this to predict clusters for points_to_predict.
        This allows predicting on multiple datasets without corrupting
            prediction data associated with clusterer.

        If neither n_clusters nor prediction_data are supplied,
            then the prediction_data associated with clusterer is used.

    return_prediction_data: bool, default=False
        If True, return prediction_data along with labels and proba.

    Returns
    -------
    labels : array (n_samples,)
        The predicted labels of the ``points_to_predict``

    probabilities : array (n_samples,)
        The soft cluster scores for each of the ``points_to_predict``

    prediction_data: PredictionData, optional
        prediction_data used to predict.
        Returned if return_prediction_data is set to True.

    Usage
    -----
    # From a fitted HDBSCAN model, predict for n_clusters=5
    labels, proba = approximate_predict_flat(
                        clusterer, X_predict, n_clusters=5)

    # Store prediction data for later use.
    labels, proba, pred_data = approximate_predict_flat(
                                    clusterer, X_predict, n_clusters=5,
                                    return_prediction_data=True)
    # and use this prediction data to predict on new points
    labels1, proba1 = approximate_predict_flat(
                                    clusterer, X_pred1,
                                    prediction_data=pred_data)

    See Also
    ---------
    :py:func:`hdbscan.prediction.approximate_predict`
    """
    # Get number of fitted clusters for later use.
    n_clusters_fit = np.sum(np.unique(clusterer.labels_) >= 0)
    if n_clusters is not None:
        n_clusters = int(n_clusters)  # Ensure n_clusters is int

    # We'll need the condensed tree later...
    condensed_tree = clusterer.condensed_tree_

    # If none of the three arguments prediction_data, n_clusters,
    #   and cluster_selection_epsilon are supplied,
    #   then use clusterer's prediction data directly
    if ((prediction_data is None) and
            ((n_clusters is None) or (n_clusters == n_clusters_fit)) and
            (cluster_selection_epsilon is None)):
        prediction_data = clusterer.prediction_data_

    # If either n_clusters or cluster_selection_epsilon was supplied,
    #   then build prediction data from it by modifying clusterer's
    if not isinstance(prediction_data, PredictionData):
        if clusterer.prediction_data_ is None:
            raise ValueError(
                    'Clusterer does not have prediction data!'
                    ' Try fitting with prediction_data=True set,'
                    ' or run generate_prediction_data on the clusterer')
        # Get prediction data from clusterer
        prediction_data = clusterer.prediction_data_
        # Modify prediction_data to reflect new n_clusters
        # First, make a copy of prediction data to avoid modifying the source
        prediction_data = copy.deepcopy(prediction_data)
        # The cluster selection method is held by condensed_tree.
        # Change from 'eom' to 'leaf' if n_clusters is too large.
        if ((condensed_tree.cluster_selection_method == 'eom') and (
                (n_clusters is not None) and (n_clusters > n_clusters_fit))):
            warn(f"Cannot predict more than {n_clusters_fit} clusters with "
                 "cluster selection method 'eom'. Changing to method "
                 "'leaf'...")
            condensed_tree.cluster_selection_method = 'leaf'
        # This change does not affect the tree associated with 'clusterer'
        # Re-initialize prediction_data for the specified n_clusters or
        #   epsilon
        re_init(prediction_data, condensed_tree,
                n_clusters=n_clusters,
                cluster_selection_epsilon=cluster_selection_epsilon)

    # ============================================================
    # Now we're ready to use prediction_data
    # The rest of the code is copied from HDBSCAN's approximate_predict,
    #   but modified to use prediction_data instead of clusterer's attribute
    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != prediction_data.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if prediction_data.cluster_tree.shape[0] == 0:
        warn('Prediction data does not have any defined clusters, new data'
             ' will be automatically predicted as noise.')
        labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32)
        probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32)
        if return_prediction_data:
            return labels, probabilities, prediction_data
        else:
            return labels, probabilities

    labels = np.empty(points_to_predict.shape[0], dtype=np.intp)
    probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = prediction_data.tree.query(
                                              points_to_predict,
                                              k=2 * min_samples)

    for i in range(points_to_predict.shape[0]):
        label, prob = _find_cluster_and_probability(
            condensed_tree,
            prediction_data.cluster_tree,
            neighbor_indices[i],
            neighbor_distances[i],
            prediction_data.core_distances,
            prediction_data.cluster_map,
            prediction_data.max_lambdas,
            min_samples
        )
        labels[i] = label
        probabilities[i] = prob

    if return_prediction_data:
        return labels, probabilities, prediction_data
    else:
        return labels, probabilities


def membership_vector_flat(
        clusterer, points_to_predict,
        prediction_data=None, n_clusters=None,
        cluster_selection_epsilon=0.):
    """
    (Adaptation of hdbscan's membership_vector for n_clusters, epsilon)
    Predict soft cluster membership probabilities;
        a vector for each point in ``points_to_predict`` that gives
        the probability that the given point is a member of a cluster
        for each of the selected clusters of the ``clusterer``.

    Parameters
    ----------
    clusterer: HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict: array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    prediction_data: PredictionData, default=None
        Prediction data associated with HDBSCAN for some flat clustering

    n_clusters: int, default=None
        Number of clusters over which to compute membership probabilities.
        These clusters are obtained as a flat clustering at some
            cluster_selection_epsilon.

    cluster_selection_epsilon: float, default=0.
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    Note: If neither n_clusters nor cluster_selection_epsilon is supplied,
        the clusterer's original clustering is used.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` is a member of cluster ``j`` is
        in ``membership_vectors[i, j]``.

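    Usage
    -----
    # Soft-cluster memberships over a flat clustering with 4 clusters
    # (a sketch; X_predict is array-like with the training dimensionality,
    #   and the value 4 is illustrative):
    memberships = membership_vector_flat(clusterer, X_predict, n_clusters=4)
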
    See Also
    --------
    :py:func:`hdbscan.prediction.membership_vector`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    """
    points_to_predict = points_to_predict.astype(np.float64)
    # Extract condensed tree for later use
    condensed_tree = clusterer.condensed_tree_

    # Choose flat clustering based on cluster_selection_epsilon or n_clusters.
    # If neither is specified, use clusterer's cluster_selection_epsilon
    if ((n_clusters is None) and
            (cluster_selection_epsilon == 0.) and
            (prediction_data is None)):
        epsilon = clusterer.cluster_selection_epsilon
        # Use the same prediction_data as clusterer's
        prediction_data = clusterer.prediction_data_
    elif prediction_data is None:
        if n_clusters is not None:
            # Compute cluster_selection_epsilon so that a flat clustering
            #   produces a specified number of n_clusters
            # With method 'eom', we may fail to get 'n_clusters' clusters. So,
            try:
                epsilon = select_epsilon(condensed_tree, n_clusters)
            except AssertionError:
                warn(f"Failed to predict {n_clusters} clusters with "
                     "cluster selection method 'eom'. Switching to 'leaf'...")
                condensed_tree.cluster_selection_method = 'leaf'
                epsilon = select_epsilon(condensed_tree, n_clusters)
        else:
            epsilon = cluster_selection_epsilon
        # Create another instance of prediction_data that is consistent
        #   with the selected value of epsilon.
        prediction_data = copy.deepcopy(clusterer.prediction_data_)
        re_init(prediction_data, condensed_tree,
                cluster_selection_epsilon=epsilon)

    # Flat clustering from prediction data
    clusters = clusters_from_prediction_data(prediction_data)

    # Initialize probabilities
    result = np.empty((points_to_predict.shape[0], clusters.shape[0]),
                      dtype=np.float64)

    # k-NN for prediction points to training set
    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        prediction_data.tree.query(points_to_predict,
                                   k=2*min_samples)

    # Loop over prediction points to compute probabilities
    for i in range(points_to_predict.shape[0]):
        # We need to find where in the tree the new point would go
        # for the purposes of outlier membership approximation
        nearest_neighbor, lambda_ = \
            _find_neighbor_and_lambda(
                neighbor_indices[i],
                neighbor_distances[i],
                prediction_data.core_distances,
                min_samples)

        # Find row in tree where nearest neighbor drops out,
        #   so we can get a lambda value for the nearest neighbor
        neighbor_tree_row = get_tree_row_with_child(
                    condensed_tree._raw_tree, nearest_neighbor)

        # Assign lambda as min(lambda-to-neighbor, neighbor's-lambda-to-tree)
        # Equivalently, this assigns core distance for prediction point as
        #   max(dist-to-neighbor, neighbor's-dist-to-tree)
        if neighbor_tree_row['lambda_val'] <= lambda_:
            lambda_ = neighbor_tree_row['lambda_val']

        # Probabilities based on distance to closest exemplar in each cluster:
        # Use new prediction_data that points to exemplars that are specific
        #   to the choice of n_clusters
        distance_vec = dist_membership_vector(
            points_to_predict[i],
            prediction_data.exemplars,
            prediction_data.dist_metric)
        # Probabilities based on how long the nearest exemplar persists in
        #   each cluster (with respect to most persistent exemplar)
        # Use new clusters that are defined by the choice of n_clusters.
        outlier_vec = outlier_membership_vector(
            nearest_neighbor,
            lambda_,
            clusters,
            condensed_tree._raw_tree,
            prediction_data.leaf_max_lambdas,
            prediction_data.cluster_tree)

        # Merge the two probabilities to produce a single set of probabilities
        result[i] = distance_vec ** 0.5 * outlier_vec ** 2.0
        result[i] /= result[i].sum()

        # Include probability that the nearest neighbor belongs to a cluster
        result[i] *= prob_in_some_cluster(
            nearest_neighbor,
            lambda_,
            clusters,
            condensed_tree._raw_tree,
            prediction_data.leaf_max_lambdas,
            prediction_data.cluster_tree)

    # Rename variable so it's easy to understand what's being returned
    membership_vectors = result
    return membership_vectors


def all_points_membership_vectors_flat(
        clusterer, prediction_data=None,
        n_clusters=None, cluster_selection_epsilon=None):
    """
    (Adaptation of hdbscan's all_points_membership_vectors
        for n_clusters, epsilon)
    Predict soft cluster membership vectors for all points in the
    original dataset the clusterer was trained on. This function is more
    efficient by making use of the fact that all points are already in the
    condensed tree, and processing in bulk.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.
        This method does not work if the clusterer was trained
        with ``metric='precomputed'``.

    prediction_data: PredictionData, default=None
        Prediction data associated with HDBSCAN for some flat clustering

    n_clusters: int, optional, default=None
        Number of clusters over which to compute membership probabilities.
        These clusters are obtained as a flat clustering at some
            cluster_selection_epsilon.

    cluster_selection_epsilon: float, optional, default=None
        core-distance below which to stop splitting clusters.
        This can indirectly impose n_clusters.
        This argument is ignored if n_clusters is supplied.

    Note: If neither n_clusters nor cluster_selection_epsilon is supplied,
        the clusterer's original clustering is used.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` of the original dataset is a member
        of cluster ``j`` is in ``membership_vectors[i, j]``.
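
    Usage
    -----
    # Soft memberships for all training points at a 5-cluster flat
    #   clustering (a sketch; the value 5 is illustrative):
    memberships = all_points_membership_vectors_flat(clusterer, n_clusters=5)
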
    See Also
    --------
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
    :py:func:`hdbscan.prediction.membership_vector`
    """
    # Extract condensed tree for later use
    condensed_tree = clusterer.condensed_tree_

    # Choose flat clustering based on cluster_selection_epsilon or n_clusters.
    # If neither is specified (and no prediction_data is supplied),
    #   use clusterer's cluster_selection_epsilon
    if ((n_clusters is None) and
            (cluster_selection_epsilon is None) and
            (prediction_data is None)):
        epsilon = clusterer.cluster_selection_epsilon
        # Use the same prediction_data as clusterer's
        prediction_data = clusterer.prediction_data_
    elif prediction_data is None:
        if n_clusters is not None:
            # Compute cluster_selection_epsilon so that a flat clustering
            #   produces a specified number of n_clusters
            # With method 'eom', we may fail to get 'n_clusters' clusters. So,
            try:
                epsilon = select_epsilon(condensed_tree, n_clusters)
            except AssertionError:
                warn(f"Failed to predict {n_clusters} clusters with "
                     "cluster selection method 'eom'. Switching to 'leaf'...")
                condensed_tree.cluster_selection_method = 'leaf'
                epsilon = select_epsilon(condensed_tree, n_clusters)
        else:
            epsilon = cluster_selection_epsilon
        # Create another instance of prediction_data that is consistent
        #   with the selected value of epsilon.
        prediction_data = copy.deepcopy(clusterer.prediction_data_)
        re_init(prediction_data, condensed_tree,
                cluster_selection_epsilon=epsilon)

    # Flat clustering at the chosen epsilon from prediction_data
    clusters = clusters_from_prediction_data(prediction_data)

    all_points = prediction_data.raw_data

    # When no clusters are found, return an array of zeros
    if clusters.size == 0:
        return np.zeros(all_points.shape[0])

    # Probabilities based on distance to closest exemplar in each cluster:
    # Use new prediction_data that points to exemplars that are specific
    #   to the choice of n_clusters
    distance_vecs = all_points_dist_membership_vector(
        all_points,
        prediction_data.exemplars,
        prediction_data.dist_metric)

    # Probabilities based on how long the point persists in
    #   each cluster (with respect to most persistent exemplar)
    # Use new clusters that are defined by the choice of n_clusters.
    outlier_vecs = all_points_outlier_membership_vector(
        clusters,
        condensed_tree._raw_tree,
        prediction_data.leaf_max_lambdas,
        prediction_data.cluster_tree)

    # Include probability that the point belongs to a cluster
    in_cluster_probs = all_points_prob_in_some_cluster(
        clusters,
        condensed_tree._raw_tree,
        prediction_data.leaf_max_lambdas,
        prediction_data.cluster_tree)

    # Aggregate the three probabilities to produce membership vectors
    result = distance_vecs * outlier_vecs
    row_sums = result.sum(axis=1)
    result = result / row_sums[:, np.newaxis]
    result *= in_cluster_probs[:, np.newaxis]

    # Rename variable to clarify what's being returned.
    membership_vectors = result
    return membership_vectors


def select_epsilon(condensed_tree, n_clusters):
    """
    Pick the optimal epsilon from the condensed tree based on n_clusters;
        dispatches to functions specific to the 'eom' or 'leaf'
        selection method.
    """
    cluster_selection_method = condensed_tree.cluster_selection_method
    if cluster_selection_method == 'eom':
        return select_epsilon_eom(condensed_tree, n_clusters)
    if cluster_selection_method == 'leaf':
        return select_epsilon_leaf(condensed_tree, n_clusters)
    raise ValueError('Invalid Cluster Selection Method: %s\n'
                     'Should be one of: "eom", "leaf"\n'
                     % cluster_selection_method)


def select_epsilon_eom(condensed_tree, n_clusters):
    """
    Select epsilon so that persistence-based clustering,
        after truncating the tree at that epsilon,
        has exactly 'n_clusters' clusters.
    """
    # With method 'eom', the maximum number of clusters is produced
    #   for epsilon=0, as computed by
    eom_base_clusters = condensed_tree._select_clusters()
    max_clusters = len(eom_base_clusters)
    # Increasing epsilon can only reduce the number of output clusters.

    assert n_clusters <= max_clusters, (
            f"Cannot produce more than {max_clusters} clusters with "
            "method 'eom'. Use method 'leaf' instead to extract a "
            "flat clustering.")

    tree = condensed_tree._raw_tree
    # To select epsilon, consider all values where clusters are split
    cluster_lambdas = tree['lambda_val'][tree['child_size'] > 1]
    candidate_epsilons = 1./np.unique(cluster_lambdas) - 1.e-12
    # Subtract 1e-12 to avoid numerical errors in comparison;
    #   splitting is then avoided for all epsilons below each candidate.
    candidate_epsilons = np.sort(candidate_epsilons)[::-1]

    for epsilon in candidate_epsilons:
        sel_clusters = _new_select_clusters(condensed_tree, epsilon)
        if len(sel_clusters) == n_clusters:
            break
    else:
        raise RuntimeError("Could not find epsilon")

    return epsilon


def select_epsilon_leaf(condensed_tree, n_clusters):
    """
    Select epsilon so that the leaves of the condensed tree,
        after truncating at that epsilon,
        number exactly 'n_clusters'.
    """
    # Use an epsilon value that produces the right number of clusters.
    # The condensed tree of HDBSCAN has this information.
    # Extract the lambda levels (=1/distance) from the condensed tree
    lambdas = condensed_tree._raw_tree['lambda_val']
    # We don't want values that produce a large cluster and
    #   just one or two individual points.
    child_sizes = condensed_tree._raw_tree['child_size']
    child_sizes = child_sizes.astype(int)
    # Keep only those lambda values corresponding to cluster separation;
    #   i.e., with child_sizes > 1
    lambdas = lambdas[child_sizes > 1]
    # Get the unique values, because when two clusters fall out of one,
    #   the entry with that lambda is repeated.
    lambdas = np.unique(lambdas.astype(float))
    if n_clusters > len(lambdas) + 1:
        warn(f"HDBSCAN can only compute {len(lambdas)+1} clusters. "
             f"Setting n_clusters to {len(lambdas)+1}...")
        n_clusters = len(lambdas) + 1
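    # Worked example (illustrative values, not from the library): if the
    #   unique split lambdas are [0.5, 1.0, 2.0], then n_clusters=3 picks
    #   lambdas[3-2] = 1.0, i.e. an epsilon just below 1.0. The splits at
    #   lambdas 0.5 and 1.0 (distances 2.0 and 1.0) are kept, yielding
    #   3 clusters, while the split at lambda 2.0 (distance 0.5 < epsilon)
    #   is suppressed.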

    # lambda values are sorted by np.unique.
    # Now, get epsilon (the distance threshold) as 1/lambda
    epsilon = 1./lambdas[n_clusters-2]
    # At this lambda, the tree has split into n_clusters clusters;
    #   stop splits at epsilons smaller than this.
    # Subtract 1e-12 to allow for numerical errors in comparison.
    return epsilon - 1.e-12


def re_init(predData, condensed_tree,
            n_clusters=None, cluster_selection_epsilon=0.):
    """
    Modify the PredictionData of HDBSCAN to account for epsilon.
    epsilon is the cluster_selection_epsilon that controls the granularity
        of clusters; a larger epsilon suppresses more splits, and
        hence produces fewer clusters.

    Parameters
    ----------
    predData: PredictionData
        Contains data to use for predicting novel points.
        Defined in the HDBSCAN module

    condensed_tree: CondensedTree
        Tree structure that contains the hierarchical clustering.
        Defined in the HDBSCAN module

    n_clusters: int, optional, default=None
        If specified, use this to obtain cluster_selection_epsilon
            from the CondensedTree; overrides the
            cluster_selection_epsilon parameter.

    cluster_selection_epsilon: float, default=0.
        In the cluster tree, nodes are not split further once the
            splitting distance falls below this value.
        epsilon is a distance threshold, the inverse of a lambda value.

    Returns
    -------
    None
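
    Usage
    -----
    # Retune a fitted clusterer's prediction data to a 6-cluster flat
    #   clustering, modifying it in place (a sketch; 6 is illustrative):
    re_init(clusterer.prediction_data_, clusterer.condensed_tree_,
            n_clusters=6)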
744    """
    # predData must be a pre-trained PredictionData instance from hdbscan
    # If n_clusters is specified, compute cluster_selection_epsilon from it
    if n_clusters is not None:
        cluster_selection_epsilon = select_epsilon(condensed_tree, n_clusters)

    # This is the key modification:
    # Select clusters according to the selection method and epsilon.
    selected_clusters = _new_select_clusters(condensed_tree,
                                             cluster_selection_epsilon)
    # _new_select_clusters is a modification of get_clusters
    #   from hdbscan._hdbscan_tree

    # raw tree, used later to get exemplars and lambda values
    raw_condensed_tree = condensed_tree._raw_tree

    # Re-do the cluster map: map cluster numbers in the tree (N, N+1, ...)
    #    to the cluster labels produced as output
    predData.cluster_map = {int(c): n for n, c in
                            enumerate(sorted(list(selected_clusters)))}
    predData.reverse_cluster_map = {n: c for c, n in
                                    predData.cluster_map.items()}

    # Re-compute lambdas and exemplars for the selected clusters
    predData.max_lambdas = {}
    predData.exemplars = []

    for cluster in selected_clusters:
        # max_lambda <=> smallest distance <=> most persistent point(s)
        predData.max_lambdas[cluster] = \
                    raw_condensed_tree['lambda_val'][
                        raw_condensed_tree['parent'] == cluster].max()

        # Map all sub-clusters of the selected cluster to the selected
        #       cluster's label in the output.
        # Map lambdas too...
        for sub_cluster in predData._clusters_below(cluster):
            predData.cluster_map[sub_cluster] = predData.cluster_map[cluster]
            predData.max_lambdas[sub_cluster] = predData.max_lambdas[cluster]

        # Create a set of exemplar points for later use.
        # Novel points are assigned based on the cluster of the
        #   closest exemplar.
        cluster_exemplars = np.array([], dtype=np.int64)
        # For each selected cluster, get all of its leaves,
        #       and leaves of leaves, and so on...
        for leaf in predData._recurse_leaf_dfs(cluster):
            # Largest lambda => most persistent points
            leaf_max_lambda = raw_condensed_tree['lambda_val'][
                raw_condensed_tree['parent'] == leaf].max()
            # Get the most persistent points
            points = raw_condensed_tree['child'][
                (raw_condensed_tree['parent'] == leaf)
                & (raw_condensed_tree['lambda_val'] == leaf_max_lambda)
            ]
            # Add the most persistent points as exemplars
            cluster_exemplars = np.hstack([cluster_exemplars, points])

        # Add exemplars for each leaf of each selected cluster.
        predData.exemplars.append(predData.raw_data[cluster_exemplars])
    return


def _new_select_clusters(condensed_tree,
                         cluster_selection_epsilon,
                         allow_single_cluster=False,
                         match_reference_implementation=False):
    """
    Adaptation of get_clusters from hdbscan._hdbscan_tree.
    Avoids the label and probability computation at the end,
        and returns only the selected clusters instead.
    """
    tree = condensed_tree._raw_tree
    cluster_selection_method = condensed_tree.cluster_selection_method
    stability = compute_stability(tree)

    if allow_single_cluster:
        node_list = sorted(stability.keys(), reverse=True)
    else:
        node_list = sorted(stability.keys(), reverse=True)[:-1]
        # (exclude root)

    cluster_tree = tree[tree['child_size'] > 1]
    is_cluster = {cluster: True for cluster in node_list}

    if cluster_selection_method == 'eom':
        for node in node_list:
            child_selection = (cluster_tree['parent'] == node)
            subtree_stability = np.sum([
                stability[child] for
                child in cluster_tree['child'][child_selection]])
            if subtree_stability > stability[node]:
                is_cluster[node] = False
                stability[node] = subtree_stability
            else:
                for sub_node in _bfs_from_cluster_tree(cluster_tree, node):
                    if sub_node != node:
                        is_cluster[sub_node] = False

        if cluster_selection_epsilon != 0.0:
            eom_clusters = set([c for c in is_cluster if is_cluster[c]])
            selected_clusters = epsilon_search(eom_clusters, cluster_tree,
                                               cluster_selection_epsilon,
                                               allow_single_cluster)
            for c in is_cluster:
                if c in selected_clusters:
                    is_cluster[c] = True
                else:
                    is_cluster[c] = False

    elif cluster_selection_method == 'leaf':
        leaves = set(get_cluster_tree_leaves(cluster_tree))
        if len(leaves) == 0:
            for c in is_cluster:
                is_cluster[c] = False
            is_cluster[tree['parent'].min()] = True

        if cluster_selection_epsilon != 0.0:
            selected_clusters = epsilon_search(leaves, cluster_tree,
                                               cluster_selection_epsilon,
                                               allow_single_cluster)
        else:
            selected_clusters = leaves

        for c in is_cluster:
            if c in selected_clusters:
                is_cluster[c] = True
            else:
                is_cluster[c] = False
    else:
        raise ValueError('Invalid Cluster Selection Method: %s\n'
                         'Should be one of: "eom", "leaf"\n'
                         % cluster_selection_method)

    clusters = set([int(c) for c in is_cluster if is_cluster[c]])
    return clusters


def epsilon_search(leaves, cluster_tree, cluster_selection_epsilon,
                   allow_single_cluster):
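    """
    Select clusters at a given cluster_selection_epsilon
        (adapted from hdbscan._hdbscan_tree).
    For each candidate leaf born at a distance below epsilon, climb the
        cluster tree (via traverse_upwards) to the ancestor that persists
        beyond epsilon and select that ancestor instead; leaves born at
        or above epsilon are selected as-is.
    Returns the set of selected cluster ids.
    """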
    selected_clusters = []
    processed = []

    for leaf in leaves:
        eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0]
        if eps < cluster_selection_epsilon:
            if leaf not in processed:
                epsilon_child = traverse_upwards(
                        cluster_tree, cluster_selection_epsilon,
                        leaf, allow_single_cluster)
                if hasattr(epsilon_child, '__len__'):
                    epsilon_child = epsilon_child[0]

                selected_clusters.append(epsilon_child)

                for sub_node in _bfs_from_cluster_tree(cluster_tree,
                                                       epsilon_child):
                    if sub_node != epsilon_child:
                        processed.append(sub_node)
        else:
            selected_clusters.append(leaf)

    return set(selected_clusters)


def traverse_upwards(cluster_tree, cluster_selection_epsilon,
                     leaf, allow_single_cluster):
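    """
    Climb from 'leaf' towards the root of the cluster tree and return
        the first ancestor born at a distance greater than
        cluster_selection_epsilon (adapted from hdbscan._hdbscan_tree).
    If the root is reached, return the root when allow_single_cluster
        is True, and otherwise the node closest to the root.
    """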
    root = cluster_tree['parent'].min()
    parent = cluster_tree[cluster_tree['child'] == leaf]['parent']
    if parent == root:
        if allow_single_cluster:
            return parent
        else:
            return leaf  # return node closest to root

    parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val']
    if parent_eps > cluster_selection_epsilon:
        return parent
    else:
        return traverse_upwards(cluster_tree, cluster_selection_epsilon,
                                parent, allow_single_cluster)


def clusters_from_prediction_data(prediction_data):
    """
    Extract selected clusters from PredictionData instance.
    """
    return np.array(
            sorted(list(prediction_data.reverse_cluster_map.values()))
            ).astype(np.intp)