1"""flat.py 2 3Provides alternative functions to hdbscan.HDBSCAN and others to 41. Allow prediction on a flat clustering by specifying 'n_clusters'. 5 This is done by choosing the best cluster_selection_epsilon that produces 6 the required number of clusters without adding unnecessary outliers. 72. Makes approximate_predict, membership_vector, and 8 all_points_membership_vectors consistent with cluster_selection_epsilon 9 10Provides the following functions: 11================================== 12HDBSCAN_flat: trained HDBSCAN instance with 'n_clusters' clusters 13 The attributes (labels, probabilities, prediction_data) are tuned to 14 produce 'n_clusters' clusters. 15 16approximate_predict_flat: labels and probabilities for novel points 17 Allows selecting n_clusters for novel points, or using the 18 original clustering (potentially specified using cluster_selection_epsilon) 19 20membership_vector_flat: Soft-clustering probabilities for novel points 21 Similar to approximate_predict_flat, but for soft-clustering. 22 **Use with caution** 23 24all_points_membership_vectors_flat: Soft-clustering probabilities 25 Similar to membership_vector_flat, but for points in training set 26 **Use with caution** 27""" 28 29import copy 30from warnings import warn 31 32import numpy as np 33from ._hdbscan_tree import compute_stability, get_cluster_tree_leaves 34from .hdbscan_ import HDBSCAN, _tree_to_labels 35from .plots import _bfs_from_cluster_tree 36from .prediction import (PredictionData, 37 _find_cluster_and_probability, 38 _find_neighbor_and_lambda) 39from ._prediction_utils import (get_tree_row_with_child, 40 dist_membership_vector, 41 outlier_membership_vector, 42 prob_in_some_cluster, 43 all_points_dist_membership_vector, 44 all_points_outlier_membership_vector, 45 all_points_prob_in_some_cluster) 46 47 48def HDBSCAN_flat(X, n_clusters=None, 49 cluster_selection_epsilon=0., 50 clusterer=None, inplace=False, **kwargs): 51 """ 52 Train a HDBSCAN clusterer by specifying n_clusters. 53 Or, modify a trained clusterer to return specific n_clusters. 54 55 Parameters 56 ---------- 57 X: array-like 58 Data to be passed to HDBSCAN for training. 59 60 n_clusters: int, default=None 61 Number of clusters to produce. 62 If None, revert to default HDBSCAN 63 64 cluster_selection_epsilon: float, default=0. 65 core-distance below which to stop splitting clusters. 66 This can indirectly impose n_clusters. 67 This argument is ignored if n_clusters is supplied. 68 69 clusterer: HDBSCAN, default=None 70 If supplied, modify this clusterer to produce n_clusters clusters. 71 72 inplace: bool, default=False 73 If 'clusterer' parameter is supplied, and inplace is True, 74 modify the previous clusterer inplace. 75 If False, return a modified copy of the previous clusterer. 76 77 **kwargs: keyword arguments 78 All init arguments for HDBSCAN 79 80 Returns 81 ------- 82 new_clusterer: HDBSCAN 83 New HDBSCAN instance; returned irrespective of inplace=True or False 84 85 Usage 86 ----- 87 # Extract flat clustering from HDBSCAN's hierarchy for 7 clusters 88 clusterer = HDBSCAN_flat(X_train, n_clusters=7, 89 min_cluster_size=12, min_samples=8) 90 labels = clusterer.labels_ 91 proba = clusterer.probabilities_ 92 93 # Use a previously initialized/trained HDBSCAN 94 old_clusterer = HDBSCAN(min_cluster_size=12, min_samples=8) 95 clusterer = HDBSCAN_flat(X_train, n_clusters=7, 96 clusterer=old_clusterer, inplace=True) 97 labels = clusterer.labels_ 98 proba = clusterer.probabilities_ 99 100 See Also 101 --------- 102 :py:func:`hdbscan.HDBSCAN` 103 :py:func:`re_init` 104 """ 105 # Handle the trivial case first. 106 if (n_clusters is None) and (cluster_selection_epsilon == 0.): 107 if (not isinstance(clusterer, HDBSCAN)) or (not inplace): 108 # Always generate prediction_data to avoid later woes 109 kwargs['prediction_data'] = True 110 new_clusterer = HDBSCAN(**kwargs) 111 else: 112 new_clusterer = clusterer 113 new_clusterer.prediction_data = True 114 115 new_clusterer.fit(X) 116 return new_clusterer 117 118 if (n_clusters is not None) and (cluster_selection_epsilon != 0.): 119 warn(f"'cluster_selection_epsilon' (={cluster_selection_epsilon})" 120 f" is ignored when 'n_clusters' is supplied.") 121 cluster_selection_epsilon = 0. 122 # This will later be chosen according to n_clusters 123 124 if not isinstance(clusterer, HDBSCAN): 125 # Initialize and train clusterer if one was not previously supplied. 126 # Always generate prediction data 127 kwargs['prediction_data'] = True 128 new_clusterer = HDBSCAN(**kwargs) 129 # We do not pass cluster_selection_epsilon here. 130 # While this adds unnecessary computation, it makes the code 131 # easier to read and debug. 132 new_clusterer.fit(X) 133 else: 134 if inplace: 135 new_clusterer = clusterer 136 else: 137 new_clusterer = copy.deepcopy(clusterer) 138 139 new_clusterer.prediction_data = True 140 141 # Train on 'X'. Do this even if the supplied clusterer was trained, 142 # because we want to make sure it fits 'X'. 143 new_clusterer.prediction_data = True 144 new_clusterer.fit(X) 145 146 if new_clusterer.cluster_selection_method == 'eom': 147 max_eom_clusters = len( 148 new_clusterer.condensed_tree_._select_clusters()) 149 150 # Pick an epsilon value right after a split produces n_clusters, 151 # and the don't split further for smaller epsilon (larger lambda) 152 if n_clusters is not None: 153 if ((new_clusterer.cluster_selection_method == 'eom') and 154 (n_clusters > max_eom_clusters)): 155 warn(f"Cannot predict more than {max_eom_clusters} with cluster " 156 "selection method 'eom'. Changing to method 'leaf'...") 157 new_clusterer.cluster_selection_method = 'leaf' 158 epsilon = select_epsilon(new_clusterer.condensed_tree_, n_clusters) 159 else: 160 # Or use the specified cluster_selection_epsilon 161 epsilon = cluster_selection_epsilon 162 163 new_clusterer.cluster_selection_epsilon = float(epsilon) 164 165 # Extract tree related stuff, in order to re-assign labels 166 single_linkage_tree = new_clusterer.single_linkage_tree_ 167 single_linkage_tree = single_linkage_tree.to_numpy() 168 min_cluster_size = new_clusterer.min_cluster_size 169 cluster_selection_method = new_clusterer.cluster_selection_method 170 allow_single_cluster = new_clusterer.allow_single_cluster 171 match_reference_implementation = False 172 173 # Get labels according to the required cluster_selection_epsilon 174 output = _tree_to_labels( 175 None, 176 single_linkage_tree, min_cluster_size, 177 cluster_selection_method, 178 allow_single_cluster, 179 match_reference_implementation, 180 cluster_selection_epsilon=epsilon) 181 182 # Reflect the related changes in HDBSCAN. 183 (new_clusterer.labels_, 184 new_clusterer.probabilities_, 185 new_clusterer.cluster_persistence_, 186 new_clusterer._condensed_tree, 187 new_clusterer._single_linkage_tree) = output 188 189 # PredictionData attached to HDBSCAN should also change. 190 # A function re_init is defined in this module to handle this. 191 re_init(new_clusterer.prediction_data_, 192 new_clusterer.condensed_tree_, 193 cluster_selection_epsilon=epsilon) 194 return new_clusterer 195 196 197def approximate_predict_flat(clusterer, 198 points_to_predict, 199 n_clusters=None, 200 cluster_selection_epsilon=None, 201 prediction_data=None, 202 return_prediction_data=False): 203 """ 204 Predict the cluster label of new points at a particular flat clustering, 205 specified by n_clusters. This is a modified version of 206 hdbscan.approximate_predict to allow selection of n_clusters. 207 208 Parameters 209 ---------- 210 clusterer : HDBSCAN 211 A clustering object that has been fit to the data and 212 either had ``prediction_data=True`` set, or called the 213 ``generate_prediction_data`` method after the fact. 214 215 points_to_predict : array, or array-like (n_samples, n_features) 216 The new data points to predict cluster labels for. They should 217 have the same dimensionality as the original dataset over which 218 clusterer was fit. 219 220 n_clusters: int, default=None 221 The number of clusters to have in the flat clustering 222 (over the training data, not points_to_predict) 223 Ignored when prediction_data is supplied. 224 225 cluster_selection_epsilon: float, default=None 226 core-distance below which to stop splitting clusters. 227 This can indirectly impose n_clusters. 228 This argument is ignored if n_clusters is supplied. 229 230 prediction_data: PredictionData, default=None 231 If supplied, use this to predict clusters for points_to_predict. 232 This allows predicting on multiple datasets without corrupting 233 prediction data associated with clusterer. 234 235 If neither n_clusters, nor prediction_data are supplied, 236 then the prediction_data associated with clusterer is used. 237 238 return_prediction_data: bool, default=False 239 If True, return prediction_data along with labels and proba. 240 241 Returns 242 ------- 243 labels : array (n_samples,) 244 The predicted labels of the ``points_to_predict`` 245 246 probabilities : array (n_samples,) 247 The soft cluster scores for each of the ``points_to_predict`` 248 249 prediction_data: PredictionData, optional 250 prediction_data used to predict. 251 Returned if return_prediciton_data is set to True. 252 253 254 Usage 255 ----- 256 # From a fitted HDBSCAN model, predict for n_clusters=5 257 labels, proba = approximate_predict_flat( 258 clusterer, X_predict, n_clusters=5) 259 260 # Store prediciton data for later use. 261 labels, proba, pred_data = approximate_predict_flat( 262 clusterer, X_predict, n_clusters=5, 263 return_prediction_data=True) 264 # and use this prediction data to predict on new points 265 labels1, proba1 = approximate_predict_flat( 266 clusterer, X_pred1, 267 prediction_data=pred_data) 268 269 See Also 270 --------- 271 :py:func:`hdbscan.prediction.approximate_predict` 272 """ 273 # Get number of fitted clusters for later use. 274 n_clusters_fit = np.sum(np.unique(clusterer.labels_) >= 0) 275 if n_clusters is not None: 276 n_clusters = int(n_clusters) # Ensure n_clusters is int 277 278 # We'll need the condensed tree later... 279 condensed_tree = clusterer.condensed_tree_ 280 281 # If none of the three arguments: prediction_data, n_clusters, 282 # and cluster_selection_epsilon are supplied, 283 # then use clusterer's prediciton data directly 284 if ((prediction_data is None) and 285 ((n_clusters is None) or (n_clusters == n_clusters_fit)) and 286 (cluster_selection_epsilon is None)): 287 prediction_data = clusterer.prediction_data_ 288 289 # If either of n_clusters or cluster_selection_epsilon were supplied, 290 # then build prediction data from these by modifying clusterer's 291 if not isinstance(prediction_data, PredictionData): 292 if clusterer.prediction_data_ is None: 293 raise ValueError( 294 'Clusterer does not have prediction data!' 295 ' Try fitting with prediction_data=True set,' 296 ' or run generate_prediction_data on the clusterer') 297 # Get prediction data from clusterer 298 prediction_data = clusterer.prediction_data_ 299 # Modify prediction_data to reflect new n_clusters 300 # First, make a copy of prediction data to avoid modifying source 301 prediction_data = copy.deepcopy(prediction_data) 302 # Cluster selection method is hold by condensed_tree. 303 # Change from 'eom' to 'leaf' if n_clusters is too large. 304 if ((condensed_tree.cluster_selection_method == 'eom') and ( 305 (n_clusters is not None) and (n_clusters > n_clusters_fit))): 306 warn(f"Cannot predict more than {n_clusters_fit} with cluster " 307 "selection method 'eom'. Changing to method 'leaf'...") 308 condensed_tree.cluster_selection_method = 'leaf' 309 # This change does not affect the tree associated with 'clusterer' 310 # Re-initialize prediction_data for the specified n_clusters or epsilon 311 re_init(prediction_data, condensed_tree, 312 n_clusters=n_clusters, 313 cluster_selection_epsilon=cluster_selection_epsilon) 314 315 # ============================================================ 316 # Now we're ready to use prediction_data 317 # The rest of the code is copied from HDBSCAN's approximate_predict, 318 # but modified to use prediction_data instead of clusterer's attribute 319 points_to_predict = np.asarray(points_to_predict) 320 321 if points_to_predict.shape[1] != prediction_data.raw_data.shape[1]: 322 raise ValueError('New points dimension does not match fit data!') 323 324 if prediction_data.cluster_tree.shape[0] == 0: 325 warn('Prediction data does not have any defined clusters, new data' 326 ' will be automatically predicted as noise.') 327 labels = -1 * np.ones(points_to_predict.shape[0], dtype=np.int32) 328 probabilities = np.zeros(points_to_predict.shape[0], dtype=np.float32) 329 if return_prediction_data: 330 return labels, probabilities, prediction_data 331 else: 332 return labels, probabilities 333 334 labels = np.empty(points_to_predict.shape[0], dtype=np.int) 335 probabilities = np.empty(points_to_predict.shape[0], dtype=np.float64) 336 337 min_samples = clusterer.min_samples or clusterer.min_cluster_size 338 neighbor_distances, neighbor_indices = prediction_data.tree.query( 339 points_to_predict, 340 k=2 * min_samples) 341 342 for i in range(points_to_predict.shape[0]): 343 label, prob = _find_cluster_and_probability( 344 condensed_tree, 345 prediction_data.cluster_tree, 346 neighbor_indices[i], 347 neighbor_distances[i], 348 prediction_data.core_distances, 349 prediction_data.cluster_map, 350 prediction_data.max_lambdas, 351 min_samples 352 ) 353 labels[i] = label 354 probabilities[i] = prob 355 356 if return_prediction_data: 357 return labels, probabilities, prediction_data 358 else: 359 return labels, probabilities 360 361 362def membership_vector_flat( 363 clusterer, points_to_predict, 364 prediction_data=None, n_clusters=None, 365 cluster_selection_epsilon=0.): 366 """ 367 (Adaptation of hdbscan's membership_vector for n_clusters, epsilon) 368 Predict soft cluster membership probabilities; 369 a vector for each point in ``points_to_predict`` that gives 370 a probability that the given point is a member of a cluster 371 for each of the selected clusters of the ``clusterer``. 372 373 Parameters 374 ---------- 375 clusterer: HDBSCAN 376 A clustering object that has been fit to the data and 377 either had ``prediction_data=True`` set, or called the 378 ``generate_prediction_data`` method after the fact. 379 380 points_to_predict: array, or array-like (n_samples, n_features) 381 The new data points to predict cluster labels for. They should 382 have the same dimensionality as the original dataset over which 383 clusterer was fit. 384 385 prediction_data: PredictionData, default=None 386 Prediction data associated with HDBSCAN for some flat clustering 387 388 n_clusters: int, default=None 389 Number of clusters over which to compute membership probabilities. 390 These clusters are obtained as a flat clustering at some 391 cluster_selection_epsilon. 392 393 cluster_selection_epsilon: float, default=0. 394 core-distance below which to stop splitting clusters. 395 This can indirectly impose n_clusters. 396 This argument is ignored if n_clusters is supplied. 397 398 Note: If neither n_clusters nor cluster_selection_epsilon are supplied, 399 the clusterer's original clustering is used. 400 401 Returns 402 ------- 403 membership_vectors : array (n_samples, n_clusters) 404 The probability that point ``i`` is a member of cluster ``j`` is 405 in ``membership_vectors[i, j]``. 406 407 See Also 408 -------- 409 :py:func:`hdbscan.predict.membership_vector` 410 :py:func:`hdbscan.predict.all_points_membership_vectors` 411 """ 412 points_to_predict = points_to_predict.astype(np.float64) 413 # Extract condensed tree for later use 414 condensed_tree = clusterer.condensed_tree_ 415 416 # Choose flat clustering based on cluster_selection_epsilon or n_clusters. 417 # If neither is specified, use clusterer's cluster_selection_epsilon 418 if ((n_clusters is None) and 419 (cluster_selection_epsilon == 0.) and 420 (prediction_data is None)): 421 epsilon = clusterer.cluster_selection_epsilon 422 # Use the same prediction_data as clusterer's 423 prediction_data = clusterer.prediction_data_ 424 elif prediction_data is None: 425 if n_clusters is not None: 426 # Compute cluster_selection_epsilon so that a flat clustering 427 # produces a specified number of n_clusters 428 # With method 'eom', we may fail to get 'n_clusters' clusters. So, 429 try: 430 epsilon = select_epsilon(condensed_tree, n_clusters) 431 except AssertionError: 432 warn(f"Failed to predict {n_clusters} clusters with " 433 "cluster selection method 'eom'. Switching to 'leaf'...") 434 condensed_tree.cluster_selection_method = 'leaf' 435 epsilon = select_epsilon(condensed_tree, n_clusters) 436 else: 437 epsilon = cluster_selection_epsilon 438 # Create another instance of prediction_data that is consistent 439 # with the selected value of epsilon. 440 prediction_data = copy.deepcopy(clusterer.prediction_data_) 441 re_init(prediction_data, condensed_tree, 442 cluster_selection_epsilon=epsilon) 443 444 # Flat clustering from prediction data 445 clusters = clusters_from_prediction_data(prediction_data) 446 447 # Initialize probabilities 448 result = np.empty((points_to_predict.shape[0], clusters.shape[0]), 449 dtype=np.float64) 450 451 # k-NN for prediciton points to training set 452 min_samples = clusterer.min_samples or clusterer.min_cluster_size 453 neighbor_distances, neighbor_indices = \ 454 prediction_data.tree.query(points_to_predict, 455 k=2*min_samples) 456 457 # Loop over prediction points to compute probabilities 458 for i in range(points_to_predict.shape[0]): 459 # We need to find where in the tree the new point would go 460 # for the purposes of outlier membership approximation 461 nearest_neighbor, lambda_ = \ 462 _find_neighbor_and_lambda( 463 neighbor_indices[i], 464 neighbor_distances[i], 465 prediction_data.core_distances, 466 min_samples) 467 468 # Find row in tree where nearest neighbor drops out, 469 # so we can get a lambda value for the nearest neighbor 470 neighbor_tree_row = get_tree_row_with_child( 471 condensed_tree._raw_tree, nearest_neighbor) 472 473 # Assign lambda as min(lambda-to-neighbor, neighbor's-lambda-to-tree) 474 # Equivalently, this assigns core distance for prediction point as 475 # max(dist-to-neighbor, neighbor's-dist-to-tree) 476 if neighbor_tree_row['lambda_val'] <= lambda_: 477 lambda_ = neighbor_tree_row['lambda_val'] 478 479 # Probabilities based on distance to closest exemplar in each cluster: 480 # Use new prediction_data that points to exemplars that are specific 481 # to the choice of n_clusters 482 distance_vec = dist_membership_vector( 483 points_to_predict[i], 484 prediction_data.exemplars, 485 prediction_data.dist_metric) 486 # Probabilities based on how long the nearest exemplar persists in 487 # each cluster (with respect to most persistent exemplar) 488 # Use new clusters that are defined by the choice of n_clusters. 489 outlier_vec = outlier_membership_vector( 490 nearest_neighbor, 491 lambda_, 492 clusters, 493 condensed_tree._raw_tree, 494 prediction_data.leaf_max_lambdas, 495 prediction_data.cluster_tree) 496 497 # Merge the two probabilities to produce a single set of probabilities 498 result[i] = distance_vec ** 0.5 * outlier_vec ** 2.0 499 result[i] /= result[i].sum() 500 501 # Include probability that the nearest neighbor belongs to a cluster 502 result[i] *= prob_in_some_cluster( 503 nearest_neighbor, 504 lambda_, 505 clusters, 506 condensed_tree._raw_tree, 507 prediction_data.leaf_max_lambdas, 508 prediction_data.cluster_tree) 509 510 # Rename variable so it's easy to understand what's being returned 511 membership_vectors = result 512 return membership_vectors 513 514 515def all_points_membership_vectors_flat( 516 clusterer, prediction_data=None, 517 n_clusters=None, cluster_selection_epsilon=None): 518 """ 519 (Adaptation of hdbscan's all_points_membership_vector 520 for n_clusters, epsilon) 521 Predict soft cluster membership vectors for all points in the 522 original dataset the clusterer was trained on. This function is more 523 efficient by making use of the fact that all points are already in the 524 condensed tree, and processing in bulk. 525 526 Parameters 527 ---------- 528 clusterer : HDBSCAN 529 A clustering object that has been fit to the data and 530 either had ``prediction_data=True`` set, or called the 531 ``generate_prediction_data`` method after the fact. 532 This method does not work if the clusterer was trained 533 with ``metric='precomputed'``. 534 535 prediction_data: PredictionData, default=None 536 Prediction data associated with HDBSCAN for some flat clustering 537 538 n_clusters: int, optional, default=None 539 Number of clusters over which to compute membership probabilities. 540 These clusters are obtained as a flat clustering at some 541 cluster_selection_epsilon. 542 543 cluster_selection_epsilon: float, optional, default=None 544 core-distance below which to stop splitting clusters. 545 This can indirectly impose n_clusters. 546 This argument is ignored if n_clusters is supplied. 547 548 Note: If neither n_clusters nor cluster_selection_epsilon are supplied, 549 the clusterer's original clustering is used. 550 551 Returns 552 ------- 553 membership_vectors : array (n_samples, n_clusters) 554 The probability that point ``i`` of the original dataset is a member of 555 cluster ``j`` is in ``membership_vectors[i, j]``. 556 See Also 557 -------- 558 :py:func:`hdbscan.prediction.all_points_membership_vectors` 559 :py:func:`hdbscan.prediction.membership_vector` 560 """ 561 # Extract condensed tree for later use 562 condensed_tree = clusterer.condensed_tree_ 563 564 # Choose flat clustering based on cluster_selection_epsilon or n_clusters. 565 # If neither is specified, use clusterer's cluster_selection_epsilon 566 if (n_clusters is None) and (cluster_selection_epsilon is None): 567 epsilon = clusterer.cluster_selection_epsilon 568 # Use the same prediction_data as clusterer's 569 prediction_data = clusterer.prediction_data_ 570 elif prediction_data is None: 571 if n_clusters is not None: 572 # Compute cluster_selection_epsilon so that a flat clustering 573 # produces a specified number of n_clusters 574 # With method 'eom', we may fail to get 'n_clusters' clusters. So, 575 try: 576 epsilon = select_epsilon(condensed_tree, n_clusters) 577 except AssertionError: 578 warn(f"Failed to predict {n_clusters} clusters with " 579 "cluster selection method 'eom'. Switching to 'leaf'...") 580 condensed_tree.cluster_selection_method = 'leaf' 581 epsilon = select_epsilon(condensed_tree, n_clusters) 582 else: 583 epsilon = cluster_selection_epsilon 584 # Create another instance of prediction_data that is consistent 585 # with the selected value of epsilon. 586 prediction_data = copy.deepcopy(clusterer.prediction_data_) 587 re_init(prediction_data, condensed_tree, 588 cluster_selection_epsilon=epsilon) 589 590 # Flat clustering at the chosen epsilon from prediction_data 591 clusters = clusters_from_prediction_data(prediction_data) 592 593 all_points = prediction_data.raw_data 594 595 # When no clusters found, return array of 0's 596 if clusters.size == 0: 597 return np.zeros(all_points.shape[0]) 598 599 # Probabilities based on distance to closest exemplar in each cluster: 600 # Use new prediction_data that points to exemplars that are specific 601 # to the choice of n_clusters 602 distance_vecs = all_points_dist_membership_vector( 603 all_points, 604 prediction_data.exemplars, 605 prediction_data.dist_metric) 606 607 # Probabilities based on how long the point persists in 608 # each cluster (with respect to most persistent exemplar) 609 # Use new clusters that are defined by the choice of n_clusters. 610 outlier_vecs = all_points_outlier_membership_vector( 611 clusters, 612 condensed_tree._raw_tree, 613 prediction_data.leaf_max_lambdas, 614 prediction_data.cluster_tree) 615 616 # Include probability that the point belongs to a cluster 617 in_cluster_probs = all_points_prob_in_some_cluster( 618 clusters, 619 condensed_tree._raw_tree, 620 prediction_data.leaf_max_lambdas, 621 prediction_data.cluster_tree) 622 623 # Aggregate the three probabilities to produce membership vectors 624 result = distance_vecs * outlier_vecs 625 row_sums = result.sum(axis=1) 626 result = result / row_sums[:, np.newaxis] 627 result *= in_cluster_probs[:, np.newaxis] 628 629 # Re-name variable to clarify what's being returned. 630 membership_vectors = result 631 return membership_vectors 632 633 634def select_epsilon(condensed_tree, n_clusters): 635 """ 636 Pick optimal epsilon from condensed tree based on n_clusters, 637 calls functions specific to 'eom' or 'leaf' selection methods 638 """ 639 cluster_selection_method = condensed_tree.cluster_selection_method 640 if cluster_selection_method == 'eom': 641 return select_epsilon_eom(condensed_tree, n_clusters) 642 if cluster_selection_method == 'leaf': 643 return select_epsilon_leaf(condensed_tree, n_clusters) 644 raise ValueError('Invalid Cluster Selection Method: %s\n' 645 'Should be one of: "eom", "leaf"\n') 646 647 648def select_epsilon_eom(condensed_tree, n_clusters): 649 """ 650 Select epsilon so that persistence-based clustering, 651 after truncating the tree at the above epsilon, 652 has exactly 'n_clusters' clusters 653 """ 654 # With method 'eom', max clusters are produced for epsilon=0, 655 # as computed by 656 eom_base_clusters = condensed_tree._select_clusters() 657 max_clusters = len(eom_base_clusters) 658 # Increasing epsilon can only reduce the number of ouput clusters. 659 660 assert n_clusters <= max_clusters, ( 661 f"Cannot produce more than {max_clusters} with method 'eom'. " + 662 "Use method 'leaf' instead to extract flat clustering.") 663 664 tree = condensed_tree._raw_tree 665 # To select epsilon, consider all values where clusters are split 666 cluster_lambdas = tree['lambda_val'][tree['child_size'] > 1] 667 candidate_epsilons = 1./np.unique(cluster_lambdas) - 1.e-12 668 # Subtract the extra e-12 to avoid numerical errors in comparison 669 # Then, we avoid splitting for all epsilon below this. 670 candidate_epsilons = np.sort(candidate_epsilons)[::-1] 671 672 for epsilon in candidate_epsilons: 673 sel_clusters = _new_select_clusters(condensed_tree, epsilon) 674 if len(sel_clusters) == n_clusters: 675 break 676 else: 677 raise RuntimeError("Could not find epsilon") 678 679 return epsilon 680 681 682def select_epsilon_leaf(condensed_tree, n_clusters): 683 """ 684 Select epsilon so that the leaves of condensed tree, 685 after truncating at the above epsilon, 686 has exactly 'n_clusters' clusters 687 """ 688 # Use an epsilon value that produces the right number of clusters. 689 # The condensed tree of HDBSCAN has this information. 690 # Extract the lambda levels (=1/distance) from the condensed tree 691 lambdas = condensed_tree._raw_tree['lambda_val'] 692 # We don't want values that produce a large cluster and 693 # just one or two individual points. 694 child_sizes = condensed_tree._raw_tree['child_size'] 695 child_sizes = child_sizes.astype(int) 696 # Keep only those lambda values corresponding to cluster separation; 697 # i.e., with child_sizes > 1 698 lambdas = lambdas[child_sizes > 1] 699 # Get the unique values, because when two clusters fall out of one, 700 # the entry with lambda is repeated. 701 lambdas = np.unique(lambdas.astype(float)) 702 if n_clusters > len(lambdas) + 1: 703 warn(f"HDBSCAN can only compute {len(lambdas)+1} clusters. " 704 f"Setting n_clusters to {len(lambdas)+1}...") 705 n_clusters = len(lambdas) + 1 706 707 # lambda values are sorted by np.unique. 708 # Now, get epsilon (distance threshold) as 1/lambda 709 epsilon = 1./lambdas[n_clusters-2] 710 # At this epsilon, n_clusters have been split. 711 # Stop splits at epsilons smaller than this. 712 # To allow for numerical errors, 713 return epsilon - 1.e-12 714 715 716def re_init(predData, condensed_tree, 717 n_clusters=None, cluster_selection_epsilon=0.): 718 """ 719 Modify PredictionData of HDBSCAN to account for epsilon. 720 epsilon is the cluster_selection_epsilon that controls granularity 721 of clusters; Large epsilon => More clusters 722 723 Parameters 724 ---------- 725 predData: PredictionData 726 Contains data to use for predicting novel points. 727 Defined in the HDBSCAN module 728 729 condensed_tree: CondensedTree 730 Tree structure that contains hierarchical clustering. 731 Defined in the HDBSCAN module 732 733 n_clusters: int, optional, default=None 734 If specified, use this to obtain cluster_selection_epsilon 735 from CondensedTree; Overrides cluster_selection_epsilon parameter 736 737 cluster_selection_epsilon: float, default=0. 738 In cluster tree, nodes are not split further beyond (>=) this value. 739 epsilon is the inverse of core distance. 740 741 Returns 742 ------- 743 None 744 """ 745 # predData must be a pre-trained PredictionData instance from hdbscan 746 # If n_clusters is specified, compute cluster_selection_epsilon; 747 if (n_clusters is not None): 748 cluster_selection_epsilon = select_epsilon(condensed_tree, n_clusters) 749 750 # This is the key modification: 751 # Select clusters according to selection method and epsilon. 752 selected_clusters = _new_select_clusters(condensed_tree, 753 cluster_selection_epsilon) 754 # _new_select_clusters is a modification of get_clusters 755 # from hdbscan._hdbscan_tree 756 757 # raw tree, used later to get exemplars and lambda values 758 raw_condensed_tree = condensed_tree._raw_tree 759 760 # Re-do the cluster map: Map cluster numbers in tree (N, N+1, ..) 761 # to the cluster labels produced as output 762 predData.cluster_map = {int(c): n for n, c in 763 enumerate(sorted(list(selected_clusters)))} 764 predData.reverse_cluster_map = {n: c for c, n in 765 predData.cluster_map.items()} 766 767 # Re-compute lambdas and exemplars for selected clusters; 768 predData.max_lambdas = {} 769 predData.exemplars = [] 770 771 for cluster in selected_clusters: 772 # max_lambda <=> smallest distance <=> most persistent point(s) 773 predData.max_lambdas[cluster] = \ 774 raw_condensed_tree['lambda_val'][ 775 raw_condensed_tree['parent'] == cluster].max() 776 777 # Map all sub-clusters of selected cluster to the selected cluster's 778 # label in output. 779 # Map lambdas too... 780 for sub_cluster in predData._clusters_below(cluster): 781 predData.cluster_map[sub_cluster] = predData.cluster_map[cluster] 782 predData.max_lambdas[sub_cluster] = predData.max_lambdas[cluster] 783 784 # Create set of exemplar points for later use. 785 # Novel points are assigned based on cluster of closest exemplar. 786 cluster_exemplars = np.array([], dtype=np.int64) 787 # For each selected cluster, get all of its leaves, 788 # and leaves of leaves, and so on... 789 for leaf in predData._recurse_leaf_dfs(cluster): 790 # Largest lambda => Most persistent points 791 leaf_max_lambda = raw_condensed_tree['lambda_val'][ 792 raw_condensed_tree['parent'] == leaf].max() 793 # Get the most persistent points 794 points = raw_condensed_tree['child'][ 795 (raw_condensed_tree['parent'] == leaf) 796 & (raw_condensed_tree['lambda_val'] == leaf_max_lambda) 797 ] 798 # Add most persistent points as exemplars 799 cluster_exemplars = np.hstack([cluster_exemplars, points]) 800 801 # Add exemplars for each leaf of each selected cluster. 802 predData.exemplars.append(predData.raw_data[cluster_exemplars]) 803 return 804 805 806def _new_select_clusters(condensed_tree, 807 cluster_selection_epsilon, 808 allow_single_cluster=False, 809 match_reference_implementation=False): 810 """ 811 Adaptation of get_clusters from hdbscan._hdbscan_tree. 812 Avoids the label and proba computation at the end, 813 and returns only the selected clusters instead. 814 """ 815 tree = condensed_tree._raw_tree 816 cluster_selection_method = condensed_tree.cluster_selection_method 817 stability = compute_stability(tree) 818 819 if allow_single_cluster: 820 node_list = sorted(stability.keys(), reverse=True) 821 else: 822 node_list = sorted(stability.keys(), reverse=True)[:-1] 823 # (exclude root) 824 825 cluster_tree = tree[tree['child_size'] > 1] 826 is_cluster = {cluster: True for cluster in node_list} 827 828 if cluster_selection_method == 'eom': 829 for node in node_list: 830 child_selection = (cluster_tree['parent'] == node) 831 subtree_stability = np.sum([ 832 stability[child] for 833 child in cluster_tree['child'][child_selection]]) 834 if subtree_stability > stability[node]: 835 is_cluster[node] = False 836 stability[node] = subtree_stability 837 else: 838 for sub_node in _bfs_from_cluster_tree(cluster_tree, node): 839 if sub_node != node: 840 is_cluster[sub_node] = False 841 842 if cluster_selection_epsilon != 0.0: 843 eom_clusters = set([c for c in is_cluster if is_cluster[c]]) 844 selected_clusters = epsilon_search(eom_clusters, cluster_tree, 845 cluster_selection_epsilon, 846 allow_single_cluster) 847 for c in is_cluster: 848 if c in selected_clusters: 849 is_cluster[c] = True 850 else: 851 is_cluster[c] = False 852 853 elif cluster_selection_method == 'leaf': 854 leaves = set(get_cluster_tree_leaves(cluster_tree)) 855 if len(leaves) == 0: 856 for c in is_cluster: 857 is_cluster[c] = False 858 is_cluster[tree['parent'].min()] = True 859 860 if cluster_selection_epsilon != 0.0: 861 selected_clusters = epsilon_search(leaves, cluster_tree, 862 cluster_selection_epsilon, 863 allow_single_cluster) 864 else: 865 selected_clusters = leaves 866 867 for c in is_cluster: 868 if c in selected_clusters: 869 is_cluster[c] = True 870 else: 871 is_cluster[c] = False 872 else: 873 raise ValueError('Invalid Cluster Selection Method: %s\n' 874 'Should be one of: "eom", "leaf"\n') 875 876 clusters = set([int(c) for c in is_cluster if is_cluster[c]]) 877 return clusters 878 879 880def epsilon_search(leaves, cluster_tree, cluster_selection_epsilon, 881 allow_single_cluster): 882 selected_clusters = [] 883 processed = [] 884 885 for leaf in leaves: 886 eps = 1/cluster_tree['lambda_val'][cluster_tree['child'] == leaf][0] 887 if eps < cluster_selection_epsilon: 888 if leaf not in processed: 889 epsilon_child = traverse_upwards( 890 cluster_tree, cluster_selection_epsilon, 891 leaf, allow_single_cluster) 892 if hasattr(epsilon_child, '__len__'): 893 epsilon_child = epsilon_child[0] 894 895 selected_clusters.append(epsilon_child) 896 897 for sub_node in _bfs_from_cluster_tree(cluster_tree, 898 epsilon_child): 899 if sub_node != epsilon_child: 900 processed.append(sub_node) 901 else: 902 selected_clusters.append(leaf) 903 904 return set(selected_clusters) 905 906 907def traverse_upwards(cluster_tree, cluster_selection_epsilon, 908 leaf, allow_single_cluster): 909 root = cluster_tree['parent'].min() 910 parent = cluster_tree[cluster_tree['child'] == leaf]['parent'] 911 if parent == root: 912 if allow_single_cluster: 913 return parent 914 else: 915 return leaf # return node closest to root 916 917 parent_eps = 1/cluster_tree[cluster_tree['child'] == parent]['lambda_val'] 918 if parent_eps > cluster_selection_epsilon: 919 return parent 920 else: 921 return traverse_upwards(cluster_tree, cluster_selection_epsilon, 922 parent, allow_single_cluster) 923 924 925def clusters_from_prediction_data(prediction_data): 926 """ 927 Extract selected clusters from PredictionData instance. 928 """ 929 return np.array( 930 sorted(list(prediction_data.reverse_cluster_map.values())) 931 ).astype(np.intp) 932