1# Support various prediction methods for predicting cluster membership 2# of new or unseen points. There are several ways to interpret how 3# to do this correctly, so we provide several methods for 4# the different use cases that may arise. 5 6import numpy as np 7 8from sklearn.neighbors import KDTree, BallTree 9from .dist_metrics import DistanceMetric 10from ._hdbscan_tree import recurse_leaf_dfs 11from ._prediction_utils import (get_tree_row_with_child, 12 dist_membership_vector, 13 outlier_membership_vector, 14 prob_in_some_cluster, 15 all_points_dist_membership_vector, 16 all_points_outlier_membership_vector, 17 all_points_prob_in_some_cluster) 18from warnings import warn 19 20 21class PredictionData(object): 22 """ 23 Extra data that allows for faster prediction if cached. 24 25 Parameters 26 ---------- 27 28 data : array (n_samples, n_features) 29 The original data set that was clustered 30 31 condensed_tree : CondensedTree 32 The condensed tree object created by a clustering 33 34 min_samples : int 35 The min_samples value used in clustering 36 37 tree_type : string, optional 38 Which type of space tree to use for core distance computation. 39 One of: 40 * ``kdtree`` 41 * ``balltree`` 42 43 metric : string, optional 44 The metric used to determine distance for the clustering. 45 This is the metric that will be used for the space tree to determine 46 core distances etc. 47 48 **kwargs : 49 Any further arguments to the metric. 50 51 Attributes 52 ---------- 53 54 raw_data : array (n_samples, n_features) 55 The original data set that was clustered 56 57 tree : KDTree or BallTree 58 A space partitioning tree that can be queried for nearest neighbors. 59 60 core_distances : array (n_samples,) 61 The core distances for every point in the original data set. 62 63 cluster_map : dict 64 A dictionary mapping cluster numbers in the condensed tree to labels 65 in the final selected clustering. 66 67 cluster_tree : structured array 68 A version of the condensed tree that only contains clusters, not 69 individual points. 70 71 max_lambdas : dict 72 A dictionary mapping cluster numbers in the condensed tree to the 73 maximum lambda value seen in that cluster. 74 """ 75 _tree_type_map = {'kdtree': KDTree, 'balltree': BallTree} 76 77 def _clusters_below(self, cluster): 78 result = [] 79 to_process = [cluster] 80 81 while to_process: 82 result.extend(to_process) 83 to_process = \ 84 self.cluster_tree['child'][np.in1d(self.cluster_tree['parent'], 85 to_process)] 86 to_process = to_process.tolist() 87 88 return result 89 90 def _recurse_leaf_dfs(self, current_node): 91 children = self.cluster_tree[self.cluster_tree['parent'] == current_node]['child'] 92 if len(children) == 0: 93 return [current_node, ] 94 else: 95 return sum( 96 [recurse_leaf_dfs(self.cluster_tree, child) for child in children], []) 97 98 def __init__(self, data, condensed_tree, min_samples, 99 tree_type='kdtree', metric='euclidean', **kwargs): 100 self.raw_data = data.astype(np.float64) 101 self.tree = self._tree_type_map[tree_type](self.raw_data, 102 metric=metric, **kwargs) 103 self.core_distances = self.tree.query(data, k=min_samples)[0][:, -1] 104 self.dist_metric = DistanceMetric.get_metric(metric, **kwargs) 105 106 selected_clusters = sorted(condensed_tree._select_clusters()) 107 # raw_condensed_tree = condensed_tree.to_numpy() 108 raw_condensed_tree = condensed_tree._raw_tree 109 110 self.cluster_map = {c: n for n, c in enumerate(sorted(list(selected_clusters)))} 111 self.reverse_cluster_map = {n: c for c, n in self.cluster_map.items()} 112 113 self.cluster_tree = raw_condensed_tree[raw_condensed_tree['child_size'] > 1] 114 self.max_lambdas = {} 115 self.leaf_max_lambdas = {} 116 self.exemplars = [] 117 118 all_clusters = set(np.hstack([self.cluster_tree['parent'], 119 self.cluster_tree['child']])) 120 121 for cluster in all_clusters: 122 self.leaf_max_lambdas[cluster] = raw_condensed_tree['lambda_val'][ 123 raw_condensed_tree['parent'] == cluster].max() 124 125 for cluster in selected_clusters: 126 self.max_lambdas[cluster] = \ 127 raw_condensed_tree['lambda_val'][raw_condensed_tree['parent'] == cluster].max() 128 129 for sub_cluster in self._clusters_below(cluster): 130 self.cluster_map[sub_cluster] = self.cluster_map[cluster] 131 self.max_lambdas[sub_cluster] = self.max_lambdas[cluster] 132 133 cluster_exemplars = np.array([], dtype=np.int64) 134 for leaf in self._recurse_leaf_dfs(cluster): 135 leaf_max_lambda = raw_condensed_tree['lambda_val'][ 136 raw_condensed_tree['parent'] == leaf].max() 137 points = raw_condensed_tree['child'][ 138 (raw_condensed_tree['parent'] == leaf) 139 & (raw_condensed_tree['lambda_val'] == leaf_max_lambda) 140 ] 141 cluster_exemplars = np.hstack([cluster_exemplars, points]) 142 143 self.exemplars.append(self.raw_data[cluster_exemplars]) 144 145 146def _find_neighbor_and_lambda(neighbor_indices, neighbor_distances, 147 core_distances, min_samples): 148 """ 149 Find the nearest mutual reachability neighbor of a point, and compute 150 the associated lambda value for the point, given the mutual reachability 151 distance to a nearest neighbor. 152 153 Parameters 154 ---------- 155 neighbor_indices : array (2 * min_samples, ) 156 An array of raw distance based nearest neighbor indices. 157 158 neighbor_distances : array (2 * min_samples, ) 159 An array of raw distances to the nearest neighbors. 160 161 core_distances : array (n_samples, ) 162 An array of core distances for all points 163 164 min_samples : int 165 The min_samples value used to generate core distances. 166 167 Returns 168 ------- 169 neighbor : int 170 The index into the full raw data set of the nearest mutual reachability 171 distance neighbor of the point. 172 173 lambda_ : float 174 The lambda value at which this point joins/merges with `neighbor`. 175 """ 176 neighbor_core_distances = core_distances[neighbor_indices] 177 point_core_distances = neighbor_distances[min_samples] * np.ones( 178 neighbor_indices.shape[0]) 179 mr_distances = np.vstack(( 180 neighbor_core_distances, 181 point_core_distances, 182 neighbor_distances 183 )).max(axis=0) 184 185 nn_index = mr_distances.argmin() 186 187 nearest_neighbor = neighbor_indices[nn_index] 188 if mr_distances[nn_index] > 0.0: 189 lambda_ = 1. / mr_distances[nn_index] 190 else: 191 lambda_ = np.finfo(np.double).max 192 193 return nearest_neighbor, lambda_ 194 195 196def _extend_condensed_tree(tree, neighbor_indices, neighbor_distances, 197 core_distances, min_samples): 198 """ 199 Create a new condensed tree with an additional point added, allowing for 200 computations as if this point had been part of the original tree. Note 201 that this makes as little change to the tree as possible, with no 202 re-optimizing/re-condensing so that the selected clusters remain 203 effectively unchanged. 204 205 Parameters 206 ---------- 207 tree : structured array 208 The raw format condensed tree to update. 209 210 neighbor_indices : array (2 * min_samples, ) 211 An array of raw distance based nearest neighbor indices. 212 213 neighbor_distances : array (2 * min_samples, ) 214 An array of raw distances to the nearest neighbors. 215 216 core_distances : array (n_samples, ) 217 An array of core distances for all points 218 219 min_samples : int 220 The min_samples value used to generate core distances. 221 222 Returns 223 ------- 224 new_tree : structured array 225 The original tree with an extra row providing the parent cluster 226 and lambda information for a new point given index -1. 227 """ 228 tree_root = tree['parent'].min() 229 230 nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices, 231 neighbor_distances, 232 core_distances, 233 min_samples 234 ) 235 236 neighbor_tree_row = get_tree_row_with_child(tree, nearest_neighbor) 237 potential_cluster = neighbor_tree_row['parent'] 238 239 if neighbor_tree_row['lambda_val'] <= lambda_: 240 # New point departs with the old 241 new_tree_row = (potential_cluster, -1, 1, 242 neighbor_tree_row['lambda_val']) 243 else: 244 # Find appropriate cluster based on lambda of new point 245 while potential_cluster > tree_root and \ 246 tree[tree['child'] 247 == potential_cluster]['lambda_val'] >= lambda_: 248 potential_cluster = tree['parent'][tree['child'] == potential_cluster][0] 249 250 new_tree_row = (potential_cluster, -1, 1, lambda_) 251 252 return np.append(tree, new_tree_row) 253 254 255def _find_cluster_and_probability(tree, cluster_tree, neighbor_indices, 256 neighbor_distances, core_distances, 257 cluster_map, max_lambdas, 258 min_samples): 259 """ 260 Return the cluster label (of the original clustering) and membership 261 probability of a new data point. 262 263 Parameters 264 ---------- 265 tree : CondensedTree 266 The condensed tree associated with the clustering. 267 268 cluster_tree : structured_array 269 The raw form of the condensed tree with only cluster information (no 270 data on individual points). This is significantly more compact. 271 272 neighbor_indices : array (2 * min_samples, ) 273 An array of raw distance based nearest neighbor indices. 274 275 neighbor_distances : array (2 * min_samples, ) 276 An array of raw distances to the nearest neighbors. 277 278 core_distances : array (n_samples, ) 279 An array of core distances for all points 280 281 cluster_map : dict 282 A dictionary mapping cluster numbers in the condensed tree to labels 283 in the final selected clustering. 284 285 max_lambdas : dict 286 A dictionary mapping cluster numbers in the condensed tree to the 287 maximum lambda value seen in that cluster. 288 289 min_samples : int 290 The min_samples value used to generate core distances. 291 """ 292 raw_tree = tree._raw_tree 293 tree_root = cluster_tree['parent'].min() 294 295 nearest_neighbor, lambda_ = _find_neighbor_and_lambda(neighbor_indices, 296 neighbor_distances, 297 core_distances, 298 min_samples 299 ) 300 301 neighbor_tree_row = get_tree_row_with_child(raw_tree, nearest_neighbor) 302 potential_cluster = neighbor_tree_row['parent'] 303 304 if neighbor_tree_row['lambda_val'] > lambda_: 305 # Find appropriate cluster based on lambda of new point 306 while potential_cluster > tree_root and \ 307 cluster_tree['lambda_val'][cluster_tree['child'] 308 == potential_cluster] >= lambda_: 309 potential_cluster = cluster_tree['parent'][cluster_tree['child'] 310 == potential_cluster][0] 311 312 if potential_cluster in cluster_map: 313 cluster_label = cluster_map[potential_cluster] 314 else: 315 cluster_label = -1 316 317 if cluster_label >= 0: 318 max_lambda = max_lambdas[potential_cluster] 319 320 if max_lambda > 0.0: 321 lambda_ = min(max_lambda, lambda_) 322 prob = (lambda_ / max_lambda) 323 else: 324 prob = 1.0 325 else: 326 prob = 0.0 327 328 return cluster_label, prob 329 330 331def approximate_predict(clusterer, points_to_predict): 332 """Predict the cluster label of new points. The returned labels 333 will be those of the original clustering found by ``clusterer``, 334 and therefore are not (necessarily) the cluster labels that would 335 be found by clustering the original data combined with 336 ``points_to_predict``, hence the 'approximate' label. 337 338 If you simply wish to assign new points to an existing clustering 339 in the 'best' way possible, this is the function to use. If you 340 want to predict how ``points_to_predict`` would cluster with 341 the original data under HDBSCAN the most efficient existing approach 342 is to simply recluster with the new point(s) added to the original dataset. 343 344 Parameters 345 ---------- 346 clusterer : HDBSCAN 347 A clustering object that has been fit to the data and 348 either had ``prediction_data=True`` set, or called the 349 ``generate_prediction_data`` method after the fact. 350 351 points_to_predict : array, or array-like (n_samples, n_features) 352 The new data points to predict cluster labels for. They should 353 have the same dimensionality as the original dataset over which 354 clusterer was fit. 355 356 Returns 357 ------- 358 labels : array (n_samples,) 359 The predicted labels of the ``points_to_predict`` 360 361 probabilities : array (n_samples,) 362 The soft cluster scores for each of the ``points_to_predict`` 363 364 See Also 365 -------- 366 :py:func:`hdbscan.predict.membership_vector` 367 :py:func:`hdbscan.predict.all_points_membership_vectors` 368 369 """ 370 if clusterer.prediction_data_ is None: 371 raise ValueError('Clusterer does not have prediction data!' 372 ' Try fitting with prediction_data=True set,' 373 ' or run generate_prediction_data on the clusterer') 374 375 points_to_predict = np.asarray(points_to_predict) 376 377 if points_to_predict.shape[1] != \ 378 clusterer.prediction_data_.raw_data.shape[1]: 379 raise ValueError('New points dimension does not match fit data!') 380 381 if clusterer.prediction_data_.cluster_tree.shape[0] == 0: 382 warn('Clusterer does not have any defined clusters, new data' 383 ' will be automatically predicted as noise.') 384 labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) 385 probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) 386 return labels, probabilities 387 388 labels = np.empty(points_to_predict.shape[0], dtype=np.int) 389 probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) 390 391 min_samples = clusterer.min_samples or clusterer.min_cluster_size 392 neighbor_distances, neighbor_indices = \ 393 clusterer.prediction_data_.tree.query(points_to_predict, 394 k=2 * min_samples) 395 396 for i in range(points_to_predict.shape[0]): 397 label, prob = _find_cluster_and_probability( 398 clusterer.condensed_tree_, 399 clusterer.prediction_data_.cluster_tree, 400 neighbor_indices[i], 401 neighbor_distances[i], 402 clusterer.prediction_data_.core_distances, 403 clusterer.prediction_data_.cluster_map, 404 clusterer.prediction_data_.max_lambdas, 405 min_samples 406 ) 407 labels[i] = label 408 probabilities[i] = prob 409 410 return labels, probabilities 411 412 413def approximate_predict_scores(clusterer, points_to_predict): 414 """Predict the outlier score of new points. The returned scores 415 will be based on the original clustering found by ``clusterer``, 416 and therefore are not (necessarily) the outlier scores that would 417 be found by clustering the original data combined with 418 ``points_to_predict``, hence the 'approximate' label. 419 420 If you simply wish to calculate the outlier scores for new points 421 in the 'best' way possible, this is the function to use. If you 422 want to predict the outlier score of ``points_to_predict`` with 423 the original data under HDBSCAN the most efficient existing approach 424 is to simply recluster with the new point(s) added to the original dataset. 425 426 Parameters 427 ---------- 428 clusterer : HDBSCAN 429 A clustering object that has been fit to the data and 430 either had ``prediction_data=True`` set, or called the 431 ``generate_prediction_data`` method after the fact. 432 433 points_to_predict : array, or array-like (n_samples, n_features) 434 The new data points to predict cluster labels for. They should 435 have the same dimensionality as the original dataset over which 436 clusterer was fit. 437 438 Returns 439 ------- 440 scores : array (n_samples,) 441 The predicted scores of the ``points_to_predict`` 442 443 See Also 444 -------- 445 :py:func:`hdbscan.predict.membership_vector` 446 :py:func:`hdbscan.predict.all_points_membership_vectors` 447 448 """ 449 try: 450 clusterer.prediction_data_ 451 except AttributeError: 452 raise ValueError('Clusterer does not have prediction data!' 453 ' Try fitting with prediction_data=True set,' 454 ' or run generate_prediction_data on the clusterer') 455 456 points_to_predict = np.asarray(points_to_predict) 457 458 if points_to_predict.shape[1] != \ 459 clusterer.prediction_data_.raw_data.shape[1]: 460 raise ValueError('New points dimension does not match fit data!') 461 462 if clusterer.prediction_data_.cluster_tree.shape[0] == 0: 463 warn('Clusterer does not have any defined clusters, new data' 464 ' will be automatically predicted as outliers.') 465 scores = np.ones(points_to_predict.shape[0], dtype=np.int32) 466 return scores 467 468 scores = np.empty(points_to_predict.shape[0], dtype=np.float) 469 470 min_samples = clusterer.min_samples or clusterer.min_cluster_size 471 neighbor_distances, neighbor_indices = \ 472 clusterer.prediction_data_.tree.query(points_to_predict, 473 k=2 * min_samples) 474 475 tree = clusterer.condensed_tree_._raw_tree 476 477 parent_array = tree['parent'] 478 479 tree_root = parent_array.min() 480 max_lambdas = {} 481 for parent in np.unique(tree['parent']): 482 max_lambdas[parent] = tree[tree['parent'] == parent]['lambda_val'].max() 483 484 for n in np.argsort(parent_array): 485 cluster = tree['child'][n] 486 if cluster < tree_root: 487 break 488 489 parent = parent_array[n] 490 if max_lambdas[cluster] > max_lambdas[parent]: 491 max_lambdas[parent] = max_lambdas[cluster] 492 493 for i in range(points_to_predict.shape[0]): 494 neigh, lambda_ = _find_neighbor_and_lambda( 495 neighbor_indices[i], 496 neighbor_distances[i], 497 clusterer.prediction_data_.core_distances, 498 min_samples 499 ) 500 501 neighbor_tree_row = get_tree_row_with_child(tree, neigh) 502 potential_cluster = neighbor_tree_row['parent'] 503 504 if neighbor_distances[i].min() == 0: 505 # the point is in the dataset, fix lambda for rounding errors 506 lambda_ = neighbor_tree_row['lambda_val'] 507 508 max_lambda = max_lambdas[potential_cluster] 509 510 if max_lambda > 0.0: 511 scores[i] = (max_lambda - lambda_) / max_lambda 512 else: 513 scores[i] = 0.0 514 515 return scores 516 517 518def membership_vector(clusterer, points_to_predict): 519 """Predict soft cluster membership. The result produces a vector 520 for each point in ``points_to_predict`` that gives a probability that 521 the given point is a member of a cluster for each of the selected clusters 522 of the ``clusterer``. 523 524 Parameters 525 ---------- 526 clusterer : HDBSCAN 527 A clustering object that has been fit to the data and 528 either had ``prediction_data=True`` set, or called the 529 ``generate_prediction_data`` method after the fact. 530 531 points_to_predict : array, or array-like (n_samples, n_features) 532 The new data points to predict cluster labels for. They should 533 have the same dimensionality as the original dataset over which 534 clusterer was fit. 535 536 Returns 537 ------- 538 membership_vectors : array (n_samples, n_clusters) 539 The probability that point ``i`` is a member of cluster ``j`` is 540 in ``membership_vectors[i, j]``. 541 542 See Also 543 -------- 544 :py:func:`hdbscan.predict.predict` 545 :py:func:`hdbscan.predict.all_points_membership_vectors` 546""" 547 548 points_to_predict = points_to_predict.astype(np.float64) 549 clusters = np.array( 550 sorted(list(clusterer.condensed_tree_._select_clusters()))).astype(np.intp) 551 552 result = np.empty((points_to_predict.shape[0], clusters.shape[0]), 553 dtype=np.float64) 554 555 min_samples = clusterer.min_samples or clusterer.min_cluster_size 556 neighbor_distances, neighbor_indices = \ 557 clusterer.prediction_data_.tree.query(points_to_predict, 558 k=2 * min_samples) 559 560 for i in range(points_to_predict.shape[0]): 561 562 # We need to find where in the tree the new point would go 563 # for the purposes of outlier membership approximation 564 nearest_neighbor, lambda_ = \ 565 _find_neighbor_and_lambda( 566 neighbor_indices[i], 567 neighbor_distances[i], 568 clusterer.prediction_data_.core_distances, 569 min_samples) 570 571 neighbor_tree_row = get_tree_row_with_child( 572 clusterer.condensed_tree_._raw_tree, nearest_neighbor) 573 574 if neighbor_tree_row['lambda_val'] <= lambda_: 575 lambda_ = neighbor_tree_row['lambda_val'] 576 577 distance_vec = dist_membership_vector( 578 points_to_predict[i], 579 clusterer.prediction_data_.exemplars, 580 clusterer.prediction_data_.dist_metric) 581 outlier_vec = outlier_membership_vector( 582 nearest_neighbor, 583 lambda_, 584 clusters, 585 clusterer.condensed_tree_._raw_tree, 586 clusterer.prediction_data_.leaf_max_lambdas, 587 clusterer.prediction_data_.cluster_tree) 588 589 result[i] = distance_vec ** 0.5 * outlier_vec ** 2.0 590 result[i] /= result[i].sum() 591 592 result[i] *= prob_in_some_cluster( 593 nearest_neighbor, 594 lambda_, 595 clusters, 596 clusterer.condensed_tree_._raw_tree, 597 clusterer.prediction_data_.leaf_max_lambdas, 598 clusterer.prediction_data_.cluster_tree) 599 600 return result 601 602 603def all_points_membership_vectors(clusterer): 604 """Predict soft cluster membership vectors for all points in the 605 original dataset the clusterer was trained on. This function is more 606 efficient by making use of the fact that all points are already in the 607 condensed tree, and processing in bulk. 608 609 Parameters 610 ---------- 611 clusterer : HDBSCAN 612 A clustering object that has been fit to the data and 613 either had ``prediction_data=True`` set, or called the 614 ``generate_prediction_data`` method after the fact. 615 This method does not work if the clusterer was trained 616 with ``metric='precomputed'``. 617 618 Returns 619 ------- 620 membership_vectors : array (n_samples, n_clusters) 621 The probability that point ``i`` of the original dataset is a member of 622 cluster ``j`` is in ``membership_vectors[i, j]``. 623 624 See Also 625 -------- 626 :py:func:`hdbscan.predict.predict` 627 :py:func:`hdbscan.predict.all_points_membership_vectors` 628 """ 629 clusters = np.array(sorted(list(clusterer.condensed_tree_._select_clusters()))).astype(np.intp) 630 all_points = clusterer.prediction_data_.raw_data 631 632 # When no clusters found, return array of 0's 633 if clusters.size == 0: 634 return np.zeros(all_points.shape[0]) 635 636 distance_vecs = all_points_dist_membership_vector( 637 all_points, 638 clusterer.prediction_data_.exemplars, 639 clusterer.prediction_data_.dist_metric) 640 outlier_vecs = all_points_outlier_membership_vector( 641 clusters, 642 clusterer.condensed_tree_._raw_tree, 643 clusterer.prediction_data_.leaf_max_lambdas, 644 clusterer.prediction_data_.cluster_tree) 645 in_cluster_probs = all_points_prob_in_some_cluster( 646 clusters, 647 clusterer.condensed_tree_._raw_tree, 648 clusterer.prediction_data_.leaf_max_lambdas, 649 clusterer.prediction_data_.cluster_tree) 650 651 result = distance_vecs * outlier_vecs 652 row_sums = result.sum(axis=1) 653 result = result / row_sums[:, np.newaxis] 654 result *= in_cluster_probs[:, np.newaxis] 655 656 return result 657