# Support a variety of methods for predicting the cluster membership
# of new or unseen points. There are several reasonable interpretations
# of how to do this, so we provide separate methods for the different
# use cases that may arise.

import numpy as np

from sklearn.neighbors import KDTree, BallTree
from .dist_metrics import DistanceMetric
from ._hdbscan_tree import recurse_leaf_dfs
from ._prediction_utils import (get_tree_row_with_child,
                                dist_membership_vector,
                                outlier_membership_vector,
                                prob_in_some_cluster,
                                all_points_dist_membership_vector,
                                all_points_outlier_membership_vector,
                                all_points_prob_in_some_cluster)
from warnings import warn


class PredictionData(object):
    """
    Extra data that allows for faster prediction if cached.

    Parameters
    ----------

    data : array (n_samples, n_features)
        The original data set that was clustered

    condensed_tree : CondensedTree
        The condensed tree object created by a clustering

    min_samples : int
        The min_samples value used in clustering

    tree_type : string, optional
        Which type of space tree to use for core distance computation.
        One of:
            * ``kdtree``
            * ``balltree``

    metric : string, optional
        The metric used to determine distance for the clustering.
        This is the metric that will be used for the space tree to determine
        core distances etc.

    **kwargs :
        Any further arguments to the metric.

    Attributes
    ----------

    raw_data : array (n_samples, n_features)
        The original data set that was clustered

    tree : KDTree or BallTree
        A space partitioning tree that can be queried for nearest neighbors.

    core_distances : array (n_samples,)
        The core distances for every point in the original data set.

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    cluster_tree : structured array
        A version of the condensed tree that only contains clusters, not
        individual points.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.
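
    Examples
    --------
    Normally this object is built for you by ``HDBSCAN`` when
    ``prediction_data=True`` is set; a direct construction might look
    like the following sketch (the data and parameter values here are
    illustrative only)::

        import numpy as np
        import hdbscan

        data = np.random.rand(100, 2)
        clusterer = hdbscan.HDBSCAN(min_samples=5,
                                    min_cluster_size=10).fit(data)
        pdata = PredictionData(data, clusterer.condensed_tree_,
                               min_samples=5)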
    """
    _tree_type_map = {'kdtree': KDTree, 'balltree': BallTree}

    def _clusters_below(self, cluster):
        # Collect the given cluster and all of its descendants in the
        # cluster tree via a breadth-first traversal.
        result = []
        to_process = [cluster]

        while to_process:
            result.extend(to_process)
            to_process = \
                self.cluster_tree['child'][np.isin(self.cluster_tree['parent'],
                                                   to_process)]
            to_process = to_process.tolist()

        return result

    def _recurse_leaf_dfs(self, current_node):
        # Return the leaf clusters beneath current_node, or the node
        # itself if it has no children.
        children = self.cluster_tree[self.cluster_tree['parent'] == current_node]['child']
        if len(children) == 0:
            return [current_node, ]
        else:
            return sum(
                [recurse_leaf_dfs(self.cluster_tree, child) for child in children], [])

    def __init__(self, data, condensed_tree, min_samples,
                 tree_type='kdtree', metric='euclidean', **kwargs):
        self.raw_data = data.astype(np.float64)
        self.tree = self._tree_type_map[tree_type](self.raw_data,
                                                   metric=metric, **kwargs)
        self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1]
        self.dist_metric = DistanceMetric.get_metric(metric, **kwargs)

        selected_clusters = sorted(condensed_tree._select_clusters())
        raw_condensed_tree = condensed_tree._raw_tree

        self.cluster_map = {c: n for n, c in enumerate(selected_clusters)}
        self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()}

        self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size'] > 1]
        self.max_lambdas = {}
        self.leaf_max_lambdas = {}
        self.exemplars = []

        all_clusters = set(np.hstack([self.cluster_tree['parent'],
                                      self.cluster_tree['child']]))

        for cluster in all_clusters:
            self.leaf_max_lambdas[cluster] = raw_condensed_tree['lambda_val'][
                raw_condensed_tree['parent'] == cluster].max()

        for cluster in selected_clusters:
            self.max_lambdas[cluster] = \
                raw_condensed_tree['lambda_val'][raw_condensed_tree['parent'] == cluster].max()

            for sub_cluster in self._clusters_below(cluster):
                self.cluster_map[sub_cluster] = self.cluster_map[cluster]
                self.max_lambdas[sub_cluster] = self.max_lambdas[cluster]

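            # Exemplar points for this cluster: within each leaf below the
            # cluster, take the points that persist all the way to the
            # leaf's maximum lambda value, i.e. its most central members.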
            cluster_exemplars = np.array([], dtype=np.int64)
            for leaf in self._recurse_leaf_dfs(cluster):
                leaf_max_lambda = raw_condensed_tree['lambda_val'][
                    raw_condensed_tree['parent'] == leaf].max()
                points = raw_condensed_tree['child'][
                    (raw_condensed_tree['parent'] == leaf)
                    & (raw_condensed_tree['lambda_val'] == leaf_max_lambda)
                ]
                cluster_exemplars = np.hstack([cluster_exemplars, points])

            self.exemplars.append(self.raw_data[cluster_exemplars])


def _find_neighbor_and_lambda(neighbor_indices, neighbor_distances,
                              core_distances, min_samples):
    """
    Find the nearest mutual reachability neighbor of a point, and compute
    the associated lambda value for the point, given the mutual reachability
    distance to a nearest neighbor.

    Parameters
    ----------
    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    neighbor : int
        The index into the full raw data set of the nearest mutual reachability
        distance neighbor of the point.

    lambda_ : float
        The lambda value at which this point joins/merges with `neighbor`.
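
    Examples
    --------
    A worked sketch with toy values and ``min_samples=1`` (illustrative
    only; the mutual reachability distance to each neighbor is the max of
    that neighbor's core distance, the query point's core distance, and
    the raw distance)::

        import numpy as np

        core = np.array([0.5, 2.0, 1.0])
        idx = np.array([1, 2])
        dist = np.array([0.4, 0.8])
        _find_neighbor_and_lambda(idx, dist, core, 1)
        # mutual reachability distances are [2.0, 1.0], so the nearest
        # mutual reachability neighbor is point 2 and lambda_ = 1 / 1.0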
    """
    neighbor_core_distances = core_distances[neighbor_indices]
    point_core_distances = neighbor_distances[min_samples] * np.ones(
        neighbor_indices.shape[0])
    mr_distances = np.vstack((
        neighbor_core_distances,
        point_core_distances,
        neighbor_distances
    )).max(axis=0)

    nn_index = mr_distances.argmin()

    nearest_neighbor = neighbor_indices[nn_index]
    if mr_distances[nn_index] > 0.0:
        lambda_ = 1. / mr_distances[nn_index]
    else:
        lambda_ = np.finfo(np.double).max

    return nearest_neighbor, lambda_


def _extend_condensed_tree(tree, neighbor_indices, neighbor_distances,
                           core_distances, min_samples):
    """
    Create a new condensed tree with an additional point added, allowing for
    computations as if this point had been part of the original tree. Note
    that this makes as little change to the tree as possible, with no
    re-optimizing/re-condensing so that the selected clusters remain
    effectively unchanged.

    Parameters
    ----------
    tree : structured array
        The raw format condensed tree to update.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    new_tree : structured array
        The original tree with an extra row providing the parent cluster
        and lambda information for a new point given index -1.
    """
    tree_root = tree['parent'].min()

    nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices,
                                                          neighbor_distances,
                                                          core_distances,
                                                          min_samples
                                                          )

    neighbor_tree_row = get_tree_row_with_child(tree, nearest_neighbor)
    potential_cluster = neighbor_tree_row['parent']

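    # Either the new point leaves the tree at the same lambda as its
    # neighbor, or we walk up from the neighbor's cluster until we find
    # an ancestor that was already born by the time the point attaches.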
    if neighbor_tree_row['lambda_val'] <= lambda_:
        # New point departs with the old
        new_tree_row = (potential_cluster, -1, 1,
                        neighbor_tree_row['lambda_val'])
    else:
        # Find appropriate cluster based on lambda of new point
        while potential_cluster > tree_root and \
                tree[tree['child']
                     == potential_cluster]['lambda_val'] >= lambda_:
            potential_cluster = tree['parent'][tree['child'] == potential_cluster][0]

        new_tree_row = (potential_cluster, -1, 1, lambda_)

    return np.append(tree, new_tree_row)


def _find_cluster_and_probability(tree, cluster_tree, neighbor_indices,
                                  neighbor_distances, core_distances,
                                  cluster_map, max_lambdas,
                                  min_samples):
    """
    Return the cluster label (of the original clustering) and membership
    probability of a new data point.

    Parameters
    ----------
    tree : CondensedTree
        The condensed tree associated with the clustering.

    cluster_tree : structured array
        The raw form of the condensed tree with only cluster information (no
        data on individual points). This is significantly more compact.

    neighbor_indices : array (2 * min_samples, )
        An array of raw distance based nearest neighbor indices.

    neighbor_distances : array (2 * min_samples, )
        An array of raw distances to the nearest neighbors.

    core_distances : array (n_samples, )
        An array of core distances for all points.

    cluster_map : dict
        A dictionary mapping cluster numbers in the condensed tree to labels
        in the final selected clustering.

    max_lambdas : dict
        A dictionary mapping cluster numbers in the condensed tree to the
        maximum lambda value seen in that cluster.

    min_samples : int
        The min_samples value used to generate core distances.

    Returns
    -------
    cluster_label : int
        The label of the cluster the new point is assigned to, or -1 if the
        point is predicted to be noise.

    prob : float
        The membership probability of the new point in that cluster.
    """
    raw_tree = tree._raw_tree
    tree_root = cluster_tree['parent'].min()

    nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices,
                                                          neighbor_distances,
                                                          core_distances,
                                                          min_samples
                                                          )

    neighbor_tree_row = get_tree_row_with_child(raw_tree, nearest_neighbor)
    potential_cluster = neighbor_tree_row['parent']

    if neighbor_tree_row['lambda_val'] > lambda_:
        # Find appropriate cluster based on lambda of new point
        while potential_cluster > tree_root and \
                cluster_tree['lambda_val'][cluster_tree['child']
                                           == potential_cluster] >= lambda_:
            potential_cluster = cluster_tree['parent'][cluster_tree['child']
                                                       == potential_cluster][0]

    if potential_cluster in cluster_map:
        cluster_label = cluster_map[potential_cluster]
    else:
        cluster_label = -1

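    # Membership probability is the ratio of the lambda at which the point
    # joins the cluster to the maximum lambda seen within that cluster, so
    # a point that persists as long as the cluster itself scores 1.0.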
    if cluster_label >= 0:
        max_lambda = max_lambdas[potential_cluster]

        if max_lambda > 0.0:
            lambda_ = min(max_lambda, lambda_)
            prob = (lambda_ / max_lambda)
        else:
            prob = 1.0
    else:
        prob = 0.0

    return cluster_label, prob


def approximate_predict(clusterer, points_to_predict):
    """Predict the cluster label of new points. The returned labels
    will be those of the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the cluster labels that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to assign new points to an existing clustering
    in the 'best' way possible, this is the function to use. If you
    want to predict how ``points_to_predict`` would cluster with
    the original data under HDBSCAN, the most efficient existing approach
    is to simply recluster with the new point(s) added to the original
    dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    labels : array (n_samples,)
        The predicted labels of the ``points_to_predict``

    probabilities : array (n_samples,)
        The soft cluster scores for each of the ``points_to_predict``

    See Also
    --------
    :py:func:`hdbscan.prediction.membership_vector`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`

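    Examples
    --------
    A minimal usage sketch (the data and parameter values here are
    illustrative only)::

        import numpy as np
        from hdbscan import HDBSCAN, approximate_predict

        data = np.random.rand(100, 2)
        clusterer = HDBSCAN(min_cluster_size=10,
                            prediction_data=True).fit(data)
        labels, probs = approximate_predict(clusterer,
                                            np.array([[0.5, 0.5]]))
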
    """
    if clusterer.prediction_data_ is None:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as noise.')
        labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32)
        probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32)
        return labels, probabilities

    labels = np.empty(points_to_predict.shape[0], dtype=np.intp)
    probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    for i in range(points_to_predict.shape[0]):
        label, prob = _find_cluster_and_probability(
            clusterer.condensed_tree_,
            clusterer.prediction_data_.cluster_tree,
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            clusterer.prediction_data_.cluster_map,
            clusterer.prediction_data_.max_lambdas,
            min_samples
        )
        labels[i] = label
        probabilities[i] = prob

    return labels, probabilities


def approximate_predict_scores(clusterer, points_to_predict):
    """Predict the outlier score of new points. The returned scores
    will be based on the original clustering found by ``clusterer``,
    and therefore are not (necessarily) the outlier scores that would
    be found by clustering the original data combined with
    ``points_to_predict``, hence the 'approximate' label.

    If you simply wish to calculate the outlier scores for new points
    in the 'best' way possible, this is the function to use. If you
    want to predict the outlier score of ``points_to_predict`` with
    the original data under HDBSCAN, the most efficient existing approach
    is to simply recluster with the new point(s) added to the original
    dataset.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict outlier scores for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    scores : array (n_samples,)
        The predicted scores of the ``points_to_predict``

    See Also
    --------
    :py:func:`hdbscan.prediction.membership_vector`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`

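    Examples
    --------
    A minimal usage sketch (the data here is illustrative; higher scores
    indicate more outlying points)::

        import numpy as np
        from hdbscan import HDBSCAN
        from hdbscan.prediction import approximate_predict_scores

        data = np.random.rand(100, 2)
        clusterer = HDBSCAN(min_cluster_size=10,
                            prediction_data=True).fit(data)
        scores = approximate_predict_scores(clusterer,
                                            np.array([[5.0, 5.0]]))
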
    """
    try:
        clusterer.prediction_data_
    except AttributeError:
        raise ValueError('Clusterer does not have prediction data!'
                         ' Try fitting with prediction_data=True set,'
                         ' or run generate_prediction_data on the clusterer')

    points_to_predict = np.asarray(points_to_predict)

    if points_to_predict.shape[1] != \
            clusterer.prediction_data_.raw_data.shape[1]:
        raise ValueError('New points dimension does not match fit data!')

    if clusterer.prediction_data_.cluster_tree.shape[0] == 0:
        warn('Clusterer does not have any defined clusters, new data'
             ' will be automatically predicted as outliers.')
        scores = np.ones(points_to_predict.shape[0], dtype=np.float32)
        return scores

    scores = np.empty(points_to_predict.shape[0], dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    tree = clusterer.condensed_tree_._raw_tree

    parent_array = tree['parent']

    tree_root = parent_array.min()
    max_lambdas = {}
    for parent in np.unique(tree['parent']):
        max_lambdas[parent] = tree[tree['parent'] == parent]['lambda_val'].max()

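    # Fold each child cluster's maximum lambda into its parent's entry so
    # that max_lambdas reflects the largest lambda seen anywhere below a
    # cluster, stopping once the sorted rows reach individual points.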
    for n in np.argsort(parent_array):
        cluster = tree['child'][n]
        if cluster < tree_root:
            break

        parent = parent_array[n]
        if max_lambdas[cluster] > max_lambdas[parent]:
            max_lambdas[parent] = max_lambdas[cluster]

    for i in range(points_to_predict.shape[0]):
        neigh, lambda_ = _find_neighbor_and_lambda(
            neighbor_indices[i],
            neighbor_distances[i],
            clusterer.prediction_data_.core_distances,
            min_samples
        )

        neighbor_tree_row = get_tree_row_with_child(tree, neigh)
        potential_cluster = neighbor_tree_row['parent']

        if neighbor_distances[i].min() == 0:
            # the point is in the dataset, fix lambda for rounding errors
            lambda_ = neighbor_tree_row['lambda_val']

        max_lambda = max_lambdas[potential_cluster]

        if max_lambda > 0.0:
            scores[i] = (max_lambda - lambda_) / max_lambda
        else:
            scores[i] = 0.0

    return scores


def membership_vector(clusterer, points_to_predict):
    """Predict soft cluster membership. The result is a vector for each
    point in ``points_to_predict`` giving the probability that the point
    is a member of each of the selected clusters of the ``clusterer``.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.

    points_to_predict : array, or array-like (n_samples, n_features)
        The new data points to predict cluster labels for. They should
        have the same dimensionality as the original dataset over which
        clusterer was fit.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` is a member of cluster ``j`` is
        in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.prediction.approximate_predict`
    :py:func:`hdbscan.prediction.all_points_membership_vectors`
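
    Examples
    --------
    A minimal usage sketch (the data here is illustrative; each row of
    the result sums to at most 1, with the remainder being the chance the
    point is noise)::

        import numpy as np
        from hdbscan import HDBSCAN, membership_vector

        data = np.random.rand(100, 2)
        clusterer = HDBSCAN(min_cluster_size=10,
                            prediction_data=True).fit(data)
        vectors = membership_vector(clusterer, np.array([[0.5, 0.5]]))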
    """
    points_to_predict = np.asarray(points_to_predict).astype(np.float64)
    clusters = np.array(
        sorted(clusterer.condensed_tree_._select_clusters())).astype(np.intp)

    result = np.empty((points_to_predict.shape[0], clusters.shape[0]),
                      dtype=np.float64)

    min_samples = clusterer.min_samples or clusterer.min_cluster_size
    neighbor_distances, neighbor_indices = \
        clusterer.prediction_data_.tree.query(points_to_predict,
                                              k=2 * min_samples)

    for i in range(points_to_predict.shape[0]):

        # We need to find where in the tree the new point would go
        # for the purposes of outlier membership approximation
        nearest_neighbor, lambda_ = \
            _find_neighbor_and_lambda(
                neighbor_indices[i],
                neighbor_distances[i],
                clusterer.prediction_data_.core_distances,
                min_samples)

        neighbor_tree_row = get_tree_row_with_child(
            clusterer.condensed_tree_._raw_tree, nearest_neighbor)

        if neighbor_tree_row['lambda_val'] <= lambda_:
            lambda_ = neighbor_tree_row['lambda_val']

        distance_vec = dist_membership_vector(
            points_to_predict[i],
            clusterer.prediction_data_.exemplars,
            clusterer.prediction_data_.dist_metric)
        outlier_vec = outlier_membership_vector(
            nearest_neighbor,
            lambda_,
            clusters,
            clusterer.condensed_tree_._raw_tree,
            clusterer.prediction_data_.leaf_max_lambdas,
            clusterer.prediction_data_.cluster_tree)

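        # Blend the two heuristics: soften the distance-based vector
        # (square root) and sharpen the outlier-based vector (square),
        # normalize the product, then scale by the probability that the
        # point belongs to some cluster at all.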
        result[i] = distance_vec ** 0.5 * outlier_vec ** 2.0
        result[i] /= result[i].sum()

        result[i] *= prob_in_some_cluster(
            nearest_neighbor,
            lambda_,
            clusters,
            clusterer.condensed_tree_._raw_tree,
            clusterer.prediction_data_.leaf_max_lambdas,
            clusterer.prediction_data_.cluster_tree)

    return result


def all_points_membership_vectors(clusterer):
    """Predict soft cluster membership vectors for all points in the
    original dataset the clusterer was trained on. This function is more
    efficient than calling ``membership_vector`` on the training data,
    since it exploits the fact that all points are already in the
    condensed tree and processes them in bulk.

    Parameters
    ----------
    clusterer : HDBSCAN
        A clustering object that has been fit to the data and
        either had ``prediction_data=True`` set, or called the
        ``generate_prediction_data`` method after the fact.
        This method does not work if the clusterer was trained
        with ``metric='precomputed'``.

    Returns
    -------
    membership_vectors : array (n_samples, n_clusters)
        The probability that point ``i`` of the original dataset is a member of
        cluster ``j`` is in ``membership_vectors[i, j]``.

    See Also
    --------
    :py:func:`hdbscan.prediction.approximate_predict`
    :py:func:`hdbscan.prediction.membership_vector`
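
    Examples
    --------
    A minimal usage sketch (the data here is illustrative)::

        import numpy as np
        from hdbscan import HDBSCAN, all_points_membership_vectors

        data = np.random.rand(100, 2)
        clusterer = HDBSCAN(min_cluster_size=10,
                            prediction_data=True).fit(data)
        vectors = all_points_membership_vectors(clusterer)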
    """
    clusters = np.array(
        sorted(clusterer.condensed_tree_._select_clusters())).astype(np.intp)
    all_points = clusterer.prediction_data_.raw_data

    # When no clusters were found, return an array of zeros
    if clusters.size == 0:
        return np.zeros(all_points.shape[0])

    distance_vecs = all_points_dist_membership_vector(
        all_points,
        clusterer.prediction_data_.exemplars,
        clusterer.prediction_data_.dist_metric)
    outlier_vecs = all_points_outlier_membership_vector(
        clusters,
        clusterer.condensed_tree_._raw_tree,
        clusterer.prediction_data_.leaf_max_lambdas,
        clusterer.prediction_data_.cluster_tree)
    in_cluster_probs = all_points_prob_in_some_cluster(
        clusters,
        clusterer.condensed_tree_._raw_tree,
        clusterer.prediction_data_.leaf_max_lambdas,
        clusterer.prediction_data_.cluster_tree)

    # Combine the two heuristic vectors, normalize each row, then scale by
    # the probability that the point belongs to some cluster at all.
    result = distance_vecs * outlier_vecs
    row_sums = result.sum(axis=1)
    result = result / row_sums[:, np.newaxis]
    result *= in_cluster_probs[:, np.newaxis]

    return result