1# -*- coding: utf-8 -*- 2""" 3HDBSCAN: Hierarchical Density-Based Spatial Clustering 4 of Applications with Noise 5""" 6 7import numpy as np 8 9from sklearn.base import BaseEstimator, ClusterMixin 10from sklearn.metrics import pairwise_distances 11from scipy.sparse import issparse 12from sklearn.neighbors import KDTree, BallTree 13from joblib import Memory 14import six 15from warnings import warn 16from sklearn.utils import check_array 17from joblib.parallel import cpu_count 18 19from scipy.sparse import csgraph 20 21from ._hdbscan_linkage import (single_linkage, 22 mst_linkage_core, 23 mst_linkage_core_vector, 24 label) 25from ._hdbscan_tree import (condense_tree, 26 compute_stability, 27 get_clusters, 28 outlier_scores) 29from ._hdbscan_reachability import (mutual_reachability, 30 sparse_mutual_reachability) 31 32from ._hdbscan_boruvka import KDTreeBoruvkaAlgorithm, BallTreeBoruvkaAlgorithm 33from .dist_metrics import DistanceMetric 34 35from .plots import CondensedTree, SingleLinkageTree, MinimumSpanningTree 36from .prediction import PredictionData 37 38FAST_METRICS = (KDTree.valid_metrics + BallTree.valid_metrics + 39 ['cosine', 'arccos']) 40 41# Author: Leland McInnes <leland.mcinnes@gmail.com> 42# Steve Astels <sastels@gmail.com> 43# John Healy <jchealy@gmail.com> 44# 45# License: BSD 3 clause 46from numpy import isclose 47 48def _tree_to_labels(X, single_linkage_tree, min_cluster_size=10, 49 cluster_selection_method='eom', 50 allow_single_cluster=False, 51 match_reference_implementation=False, 52 cluster_selection_epsilon=0.0): 53 """Converts a pretrained tree and cluster size into a 54 set of labels and probabilities. 55 """ 56 condensed_tree = condense_tree(single_linkage_tree, 57 min_cluster_size) 58 stability_dict = compute_stability(condensed_tree) 59 labels, probabilities, stabilities = get_clusters(condensed_tree, 60 stability_dict, 61 cluster_selection_method, 62 allow_single_cluster, 63 match_reference_implementation, 64 cluster_selection_epsilon) 65 66 return (labels, probabilities, stabilities, condensed_tree, 67 single_linkage_tree) 68 69 70def _hdbscan_generic(X, min_samples=5, alpha=1.0, metric='minkowski', p=2, 71 leaf_size=None, gen_min_span_tree=False, **kwargs): 72 if metric == 'minkowski': 73 distance_matrix = pairwise_distances(X, metric=metric, p=p) 74 elif metric == 'arccos': 75 distance_matrix = pairwise_distances(X, metric='cosine', **kwargs) 76 elif metric == 'precomputed': 77 # Treating this case explicitly, instead of letting 78 # sklearn.metrics.pairwise_distances handle it, 79 # enables the usage of numpy.inf in the distance 80 # matrix to indicate missing distance information. 81 # TODO: Check if copying is necessary 82 distance_matrix = X.copy() 83 else: 84 distance_matrix = pairwise_distances(X, metric=metric, **kwargs) 85 86 if issparse(distance_matrix): 87 # raise TypeError('Sparse distance matrices not yet supported') 88 return _hdbscan_sparse_distance_matrix(distance_matrix, min_samples, 89 alpha, metric, p, 90 leaf_size, gen_min_span_tree, 91 **kwargs) 92 93 mutual_reachability_ = mutual_reachability(distance_matrix, 94 min_samples, alpha) 95 96 min_spanning_tree = mst_linkage_core(mutual_reachability_) 97 98 # Warn if the MST couldn't be constructed around the missing distances 99 if np.isinf(min_spanning_tree.T[2]).any(): 100 warn('The minimum spanning tree contains edge weights with value ' 101 'infinity. Potentially, you are missing too many distances ' 102 'in the initial distance matrix for the given neighborhood ' 103 'size.', UserWarning) 104 105 # mst_linkage_core does not generate a full minimal spanning tree 106 # If a tree is required then we must build the edges from the information 107 # returned by mst_linkage_core (i.e. just the order of points to be merged) 108 if gen_min_span_tree: 109 result_min_span_tree = min_spanning_tree.copy() 110 for index, row in enumerate(result_min_span_tree[1:], 1): 111 candidates = np.where(isclose(mutual_reachability_[int(row[1])], 112 row[2]))[0] 113 candidates = np.intersect1d(candidates, 114 min_spanning_tree[:index, :2].astype( 115 int)) 116 candidates = candidates[candidates != row[1]] 117 assert len(candidates) > 0 118 row[0] = candidates[0] 119 else: 120 result_min_span_tree = None 121 122 # Sort edges of the min_spanning_tree by weight 123 min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), 124 :] 125 126 # Convert edge list into standard hierarchical clustering format 127 single_linkage_tree = label(min_spanning_tree) 128 129 return single_linkage_tree, result_min_span_tree 130 131 132def _hdbscan_sparse_distance_matrix(X, min_samples=5, alpha=1.0, 133 metric='minkowski', p=2, leaf_size=40, 134 gen_min_span_tree=False, **kwargs): 135 assert issparse(X) 136 # Check for connected component on X 137 if csgraph.connected_components(X, directed=False, return_labels=False) > 1: 138 raise ValueError('Sparse distance matrix has multiple connected ' 139 'components!\nThat is, there exist groups of points ' 140 'that are completely disjoint -- there are no distance ' 141 'relations connecting them\n' 142 'Run hdbscan on each component.') 143 144 lil_matrix = X.tolil() 145 146 # Compute sparse mutual reachability graph 147 # if max_dist > 0, max distance to use when the reachability is infinite 148 max_dist = kwargs.get("max_dist", 0.) 149 mutual_reachability_ = sparse_mutual_reachability(lil_matrix, 150 min_points=min_samples, 151 max_dist=max_dist, 152 alpha=alpha) 153 # Check connected component on mutual reachability 154 # If more than one component, it means that even if the distance matrix X 155 # has one component, there exists with less than `min_samples` neighbors 156 if csgraph.connected_components(mutual_reachability_, directed=False, 157 return_labels=False) > 1: 158 raise ValueError(('There exists points with less than %s neighbors. ' 159 'Ensure your distance matrix has non zeros values for ' 160 'at least `min_sample`=%s neighbors for each points (i.e. K-nn graph), ' 161 'or specify a `max_dist` to use when distances are missing.') 162 % (min_samples, min_samples)) 163 164 # Compute the minimum spanning tree for the sparse graph 165 sparse_min_spanning_tree = csgraph.minimum_spanning_tree( 166 mutual_reachability_) 167 168 # Convert the graph to scipy cluster array format 169 nonzeros = sparse_min_spanning_tree.nonzero() 170 nonzero_vals = sparse_min_spanning_tree[nonzeros] 171 min_spanning_tree = np.vstack(nonzeros + (nonzero_vals,)).T 172 173 # Sort edges of the min_spanning_tree by weight 174 min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), 175 :][0] 176 177 # Convert edge list into standard hierarchical clustering format 178 single_linkage_tree = label(min_spanning_tree) 179 180 if gen_min_span_tree: 181 return single_linkage_tree, min_spanning_tree 182 else: 183 return single_linkage_tree, None 184 185 186def _hdbscan_prims_kdtree(X, min_samples=5, alpha=1.0, 187 metric='minkowski', p=2, leaf_size=40, 188 gen_min_span_tree=False, **kwargs): 189 if X.dtype != np.float64: 190 X = X.astype(np.float64) 191 192 # The Cython routines used require contiguous arrays 193 if not X.flags['C_CONTIGUOUS']: 194 X = np.array(X, dtype=np.double, order='C') 195 196 tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) 197 198 # TO DO: Deal with p for minkowski appropriately 199 dist_metric = DistanceMetric.get_metric(metric, **kwargs) 200 201 # Get distance to kth nearest neighbour 202 core_distances = tree.query(X, k=min_samples, 203 dualtree=True, 204 breadth_first=True)[0][:, -1].copy(order='C') 205 # Mutual reachability distance is implicit in mst_linkage_core_vector 206 min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, 207 alpha) 208 209 # Sort edges of the min_spanning_tree by weight 210 min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), 211 :] 212 213 # Convert edge list into standard hierarchical clustering format 214 single_linkage_tree = label(min_spanning_tree) 215 216 if gen_min_span_tree: 217 warn('Cannot generate Minimum Spanning Tree; ' 218 'the implemented Prim\'s does not produce ' 219 'the full minimum spanning tree ', UserWarning) 220 221 return single_linkage_tree, None 222 223 224def _hdbscan_prims_balltree(X, min_samples=5, alpha=1.0, 225 metric='minkowski', p=2, leaf_size=40, 226 gen_min_span_tree=False, **kwargs): 227 if X.dtype != np.float64: 228 X = X.astype(np.float64) 229 230 # The Cython routines used require contiguous arrays 231 if not X.flags['C_CONTIGUOUS']: 232 X = np.array(X, dtype=np.double, order='C') 233 234 tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) 235 236 dist_metric = DistanceMetric.get_metric(metric, **kwargs) 237 238 # Get distance to kth nearest neighbour 239 core_distances = tree.query(X, k=min_samples, 240 dualtree=True, 241 breadth_first=True)[0][:, -1].copy(order='C') 242 243 # Mutual reachability distance is implicit in mst_linkage_core_vector 244 min_spanning_tree = mst_linkage_core_vector(X, core_distances, dist_metric, 245 alpha) 246 # Sort edges of the min_spanning_tree by weight 247 min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), 248 :] 249 # Convert edge list into standard hierarchical clustering format 250 single_linkage_tree = label(min_spanning_tree) 251 252 if gen_min_span_tree: 253 warn('Cannot generate Minimum Spanning Tree; ' 254 'the implemented Prim\'s does not produce ' 255 'the full minimum spanning tree ', UserWarning) 256 257 return single_linkage_tree, None 258 259 260def _hdbscan_boruvka_kdtree(X, min_samples=5, alpha=1.0, 261 metric='minkowski', p=2, leaf_size=40, 262 approx_min_span_tree=True, 263 gen_min_span_tree=False, 264 core_dist_n_jobs=4, **kwargs): 265 if leaf_size < 3: 266 leaf_size = 3 267 268 if core_dist_n_jobs < 1: 269 core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) 270 271 if X.dtype != np.float64: 272 X = X.astype(np.float64) 273 274 tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs) 275 alg = KDTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, 276 leaf_size=leaf_size // 3, 277 approx_min_span_tree=approx_min_span_tree, 278 n_jobs=core_dist_n_jobs, **kwargs) 279 min_spanning_tree = alg.spanning_tree() 280 # Sort edges of the min_spanning_tree by weight 281 row_order = np.argsort(min_spanning_tree.T[2]) 282 min_spanning_tree = min_spanning_tree[row_order, :] 283 # Convert edge list into standard hierarchical clustering format 284 single_linkage_tree = label(min_spanning_tree) 285 286 if gen_min_span_tree: 287 return single_linkage_tree, min_spanning_tree 288 else: 289 return single_linkage_tree, None 290 291 292def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0, 293 metric='minkowski', p=2, leaf_size=40, 294 approx_min_span_tree=True, 295 gen_min_span_tree=False, 296 core_dist_n_jobs=4, **kwargs): 297 if leaf_size < 3: 298 leaf_size = 3 299 300 if core_dist_n_jobs < 1: 301 core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1) 302 303 if X.dtype != np.float64: 304 X = X.astype(np.float64) 305 306 tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs) 307 alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric, 308 leaf_size=leaf_size // 3, 309 approx_min_span_tree=approx_min_span_tree, 310 n_jobs=core_dist_n_jobs, **kwargs) 311 min_spanning_tree = alg.spanning_tree() 312 # Sort edges of the min_spanning_tree by weight 313 min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), 314 :] 315 # Convert edge list into standard hierarchical clustering format 316 single_linkage_tree = label(min_spanning_tree) 317 318 if gen_min_span_tree: 319 return single_linkage_tree, min_spanning_tree 320 else: 321 return single_linkage_tree, None 322 323 324def check_precomputed_distance_matrix(X): 325 """Perform check_array(X) after removing infinite values (numpy.inf) from the given distance matrix. 326 """ 327 tmp = X.copy() 328 tmp[np.isinf(tmp)] = 1 329 check_array(tmp) 330 331 332def hdbscan(X, min_cluster_size=5, min_samples=None, alpha=1.0, cluster_selection_epsilon=0.0, 333 metric='minkowski', p=2, leaf_size=40, 334 algorithm='best', memory=Memory(cachedir=None, verbose=0), 335 approx_min_span_tree=True, gen_min_span_tree=False, 336 core_dist_n_jobs=4, 337 cluster_selection_method='eom', allow_single_cluster=False, 338 match_reference_implementation=False, **kwargs): 339 """Perform HDBSCAN clustering from a vector array or distance matrix. 340 341 Parameters 342 ---------- 343 X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ 344 array of shape (n_samples, n_samples) 345 A feature array, or array of distances between samples if 346 ``metric='precomputed'``. 347 348 min_cluster_size : int, optional (default=5) 349 The minimum number of samples in a group for that group to be 350 considered a cluster; groupings smaller than this size will be left 351 as noise. 352 353 min_samples : int, optional (default=None) 354 The number of samples in a neighborhood for a point 355 to be considered as a core point. This includes the point itself. 356 defaults to the min_cluster_size. 357 358 cluster_selection_epsilon: float, optional (default=0.0) 359 A distance threshold. Clusters below this value will be merged. 360 See [3]_ for more information. 361 362 alpha : float, optional (default=1.0) 363 A distance scaling parameter as used in robust single linkage. 364 See [2]_ for more information. 365 366 metric : string or callable, optional (default='minkowski') 367 The metric to use when calculating distance between instances in a 368 feature array. If metric is a string or callable, it must be one of 369 the options allowed by metrics.pairwise.pairwise_distances for its 370 metric parameter. 371 If metric is "precomputed", X is assumed to be a distance matrix and 372 must be square. 373 374 p : int, optional (default=2) 375 p value to use if using the minkowski metric. 376 377 leaf_size : int, optional (default=40) 378 Leaf size for trees responsible for fast nearest 379 neighbour queries. 380 381 algorithm : string, optional (default='best') 382 Exactly which algorithm to use; hdbscan has variants specialised 383 for different characteristics of the data. By default this is set 384 to ``best`` which chooses the "best" algorithm given the nature of 385 the data. You can force other options if you believe you know 386 better. Options are: 387 * ``best`` 388 * ``generic`` 389 * ``prims_kdtree`` 390 * ``prims_balltree`` 391 * ``boruvka_kdtree`` 392 * ``boruvka_balltree`` 393 394 memory : instance of joblib.Memory or string, optional 395 Used to cache the output of the computation of the tree. 396 By default, no caching is done. If a string is given, it is the 397 path to the caching directory. 398 399 approx_min_span_tree : bool, optional (default=True) 400 Whether to accept an only approximate minimum spanning tree. 401 For some algorithms this can provide a significant speedup, but 402 the resulting clustering may be of marginally lower quality. 403 If you are willing to sacrifice speed for correctness you may want 404 to explore this; in general this should be left at the default True. 405 406 gen_min_span_tree : bool, optional (default=False) 407 Whether to generate the minimum spanning tree for later analysis. 408 409 core_dist_n_jobs : int, optional (default=4) 410 Number of parallel jobs to run in core distance computations (if 411 supported by the specific algorithm). For ``core_dist_n_jobs`` 412 below -1, (n_cpus + 1 + core_dist_n_jobs) are used. 413 414 cluster_selection_method : string, optional (default='eom') 415 The method used to select clusters from the condensed tree. The 416 standard approach for HDBSCAN* is to use an Excess of Mass algorithm 417 to find the most persistent clusters. Alternatively you can instead 418 select the clusters at the leaves of the tree -- this provides the 419 most fine grained and homogeneous clusters. Options are: 420 * ``eom`` 421 * ``leaf`` 422 423 allow_single_cluster : bool, optional (default=False) 424 By default HDBSCAN* will not produce a single cluster, setting this 425 to t=True will override this and allow single cluster results in 426 the case that you feel this is a valid result for your dataset. 427 (default False) 428 429 match_reference_implementation : bool, optional (default=False) 430 There exist some interpretational differences between this 431 HDBSCAN* implementation and the original authors reference 432 implementation in Java. This can result in very minor differences 433 in clustering results. Setting this flag to True will, at a some 434 performance cost, ensure that the clustering results match the 435 reference implementation. 436 437 **kwargs : optional 438 Arguments passed to the distance metric 439 440 Returns 441 ------- 442 labels : ndarray, shape (n_samples, ) 443 Cluster labels for each point. Noisy samples are given the label -1. 444 445 probabilities : ndarray, shape (n_samples, ) 446 Cluster membership strengths for each point. Noisy samples are assigned 447 0. 448 449 cluster_persistence : array, shape (n_clusters, ) 450 A score of how persistent each cluster is. A score of 1.0 represents 451 a perfectly stable cluster that persists over all distance scales, 452 while a score of 0.0 represents a perfectly ephemeral cluster. These 453 scores can be guage the relative coherence of the clusters output 454 by the algorithm. 455 456 condensed_tree : record array 457 The condensed cluster hierarchy used to generate clusters. 458 459 single_linkage_tree : ndarray, shape (n_samples - 1, 4) 460 The single linkage tree produced during clustering in scipy 461 hierarchical clustering format 462 (see http://docs.scipy.org/doc/scipy/reference/cluster.hierarchy.html). 463 464 min_spanning_tree : ndarray, shape (n_samples - 1, 3) 465 The minimum spanning as an edgelist. If gen_min_span_tree was False 466 this will be None. 467 468 References 469 ---------- 470 471 .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). 472 Density-based clustering based on hierarchical density estimates. 473 In Pacific-Asia Conference on Knowledge Discovery and Data Mining 474 (pp. 160-172). Springer Berlin Heidelberg. 475 476 .. [2] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the 477 cluster tree. In Advances in Neural Information Processing Systems 478 (pp. 343-351). 479 480 .. [3] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical 481 Density-based Cluster Selection. arxiv preprint 1911.02282. 482 """ 483 if min_samples is None: 484 min_samples = min_cluster_size 485 486 if type(min_samples) is not int or type(min_cluster_size) is not int: 487 raise ValueError('Min samples and min cluster size must be integers!') 488 489 if min_samples <= 0 or min_cluster_size <= 0: 490 raise ValueError('Min samples and Min cluster size must be positive' 491 ' integers') 492 493 if min_cluster_size == 1: 494 raise ValueError('Min cluster size must be greater than one') 495 496 if type(cluster_selection_epsilon) is int: 497 cluster_selection_epsilon = float(cluster_selection_epsilon) 498 499 if type(cluster_selection_epsilon) is not float or cluster_selection_epsilon < 0.0: 500 raise ValueError('Epsilon must be a float value greater than or equal to 0!') 501 502 if not isinstance(alpha, float) or alpha <= 0.0: 503 raise ValueError('Alpha must be a positive float value greater than' 504 ' 0!') 505 506 if leaf_size < 1: 507 raise ValueError('Leaf size must be greater than 0!') 508 509 if metric == 'minkowski': 510 if p is None: 511 raise TypeError('Minkowski metric given but no p value supplied!') 512 if p < 0: 513 raise ValueError('Minkowski metric with negative p value is not' 514 ' defined!') 515 516 if match_reference_implementation: 517 min_samples = min_samples - 1 518 min_cluster_size = min_cluster_size + 1 519 approx_min_span_tree = False 520 521 if cluster_selection_method not in ('eom', 'leaf'): 522 raise ValueError('Invalid Cluster Selection Method: %s\n' 523 'Should be one of: "eom", "leaf"\n') 524 525 # Checks input and converts to an nd-array where possible 526 if metric != 'precomputed' or issparse(X): 527 X = check_array(X, accept_sparse='csr') 528 else: 529 # Only non-sparse, precomputed distance matrices are handled here 530 # and thereby allowed to contain numpy.inf for missing distances 531 check_precomputed_distance_matrix(X) 532 533 # Python 2 and 3 compliant string_type checking 534 if isinstance(memory, six.string_types): 535 memory = Memory(cachedir=memory, verbose=0) 536 537 size = X.shape[0] 538 min_samples = min(size - 1, min_samples) 539 if min_samples == 0: 540 min_samples = 1 541 542 if algorithm != 'best': 543 if metric != 'precomputed' and issparse(X) and algorithm != 'generic': 544 raise ValueError("Sparse data matrices only support algorithm 'generic'.") 545 546 if algorithm == 'generic': 547 (single_linkage_tree, 548 result_min_span_tree) = memory.cache( 549 _hdbscan_generic)(X, min_samples, alpha, metric, 550 p, leaf_size, gen_min_span_tree, **kwargs) 551 elif algorithm == 'prims_kdtree': 552 if metric not in KDTree.valid_metrics: 553 raise ValueError("Cannot use Prim's with KDTree for this" 554 " metric!") 555 (single_linkage_tree, result_min_span_tree) = memory.cache( 556 _hdbscan_prims_kdtree)(X, min_samples, alpha, 557 metric, p, leaf_size, 558 gen_min_span_tree, **kwargs) 559 elif algorithm == 'prims_balltree': 560 if metric not in BallTree.valid_metrics: 561 raise ValueError("Cannot use Prim's with BallTree for this" 562 " metric!") 563 (single_linkage_tree, result_min_span_tree) = memory.cache( 564 _hdbscan_prims_balltree)(X, min_samples, alpha, 565 metric, p, leaf_size, 566 gen_min_span_tree, **kwargs) 567 elif algorithm == 'boruvka_kdtree': 568 if metric not in BallTree.valid_metrics: 569 raise ValueError("Cannot use Boruvka with KDTree for this" 570 " metric!") 571 (single_linkage_tree, result_min_span_tree) = memory.cache( 572 _hdbscan_boruvka_kdtree)(X, min_samples, alpha, 573 metric, p, leaf_size, 574 approx_min_span_tree, 575 gen_min_span_tree, 576 core_dist_n_jobs, **kwargs) 577 elif algorithm == 'boruvka_balltree': 578 if metric not in BallTree.valid_metrics: 579 raise ValueError("Cannot use Boruvka with BallTree for this" 580 " metric!") 581 if (X.shape[0] // leaf_size) > 16000: 582 warn("A large dataset size and small leaf_size may induce excessive " 583 "memory usage. If you are running out of memory consider " 584 "increasing the ``leaf_size`` parameter.") 585 (single_linkage_tree, result_min_span_tree) = memory.cache( 586 _hdbscan_boruvka_balltree)(X, min_samples, alpha, 587 metric, p, leaf_size, 588 approx_min_span_tree, 589 gen_min_span_tree, 590 core_dist_n_jobs, **kwargs) 591 else: 592 raise TypeError('Unknown algorithm type %s specified' % algorithm) 593 else: 594 595 if issparse(X) or metric not in FAST_METRICS: 596 # We can't do much with sparse matrices ... 597 (single_linkage_tree, result_min_span_tree) = memory.cache( 598 _hdbscan_generic)(X, min_samples, 599 alpha, metric, p, leaf_size, 600 gen_min_span_tree, **kwargs) 601 elif metric in KDTree.valid_metrics: 602 # TO DO: Need heuristic to decide when to go to boruvka; 603 # still debugging for now 604 if X.shape[1] > 60: 605 (single_linkage_tree, result_min_span_tree) = memory.cache( 606 _hdbscan_prims_kdtree)(X, min_samples, alpha, 607 metric, p, leaf_size, 608 gen_min_span_tree, **kwargs) 609 else: 610 (single_linkage_tree, result_min_span_tree) = memory.cache( 611 _hdbscan_boruvka_kdtree)(X, min_samples, alpha, 612 metric, p, leaf_size, 613 approx_min_span_tree, 614 gen_min_span_tree, 615 core_dist_n_jobs, **kwargs) 616 else: # Metric is a valid BallTree metric 617 # TO DO: Need heuristic to decide when to go to boruvka; 618 # still debugging for now 619 if X.shape[1] > 60: 620 (single_linkage_tree, result_min_span_tree) = memory.cache( 621 _hdbscan_prims_balltree)(X, min_samples, alpha, 622 metric, p, leaf_size, 623 gen_min_span_tree, **kwargs) 624 else: 625 (single_linkage_tree, result_min_span_tree) = memory.cache( 626 _hdbscan_boruvka_balltree)(X, min_samples, alpha, 627 metric, p, leaf_size, 628 approx_min_span_tree, 629 gen_min_span_tree, 630 core_dist_n_jobs, **kwargs) 631 632 return _tree_to_labels(X, 633 single_linkage_tree, 634 min_cluster_size, 635 cluster_selection_method, 636 allow_single_cluster, 637 match_reference_implementation, 638 cluster_selection_epsilon) + \ 639 (result_min_span_tree,) 640 641 642# Inherits from sklearn 643class HDBSCAN(BaseEstimator, ClusterMixin): 644 """Perform HDBSCAN clustering from vector array or distance matrix. 645 646 HDBSCAN - Hierarchical Density-Based Spatial Clustering of Applications 647 with Noise. Performs DBSCAN over varying epsilon values and integrates 648 the result to find a clustering that gives the best stability over epsilon. 649 This allows HDBSCAN to find clusters of varying densities (unlike DBSCAN), 650 and be more robust to parameter selection. 651 652 Parameters 653 ---------- 654 min_cluster_size : int, optional (default=5) 655 The minimum size of clusters; single linkage splits that contain 656 fewer points than this will be considered points "falling out" of a 657 cluster rather than a cluster splitting into two new clusters. 658 659 min_samples : int, optional (default=None) 660 The number of samples in a neighbourhood for a point to be 661 considered a core point. 662 663 metric : string, or callable, optional (default='euclidean') 664 The metric to use when calculating distance between instances in a 665 feature array. If metric is a string or callable, it must be one of 666 the options allowed by metrics.pairwise.pairwise_distances for its 667 metric parameter. 668 If metric is "precomputed", X is assumed to be a distance matrix and 669 must be square. 670 671 p : int, optional (default=None) 672 p value to use if using the minkowski metric. 673 674 alpha : float, optional (default=1.0) 675 A distance scaling parameter as used in robust single linkage. 676 See [3]_ for more information. 677 678 cluster_selection_epsilon: float, optional (default=0.0) 679 A distance threshold. Clusters below this value will be merged. 680 See [5]_ for more information. 681 682 algorithm : string, optional (default='best') 683 Exactly which algorithm to use; hdbscan has variants specialised 684 for different characteristics of the data. By default this is set 685 to ``best`` which chooses the "best" algorithm given the nature of 686 the data. You can force other options if you believe you know 687 better. Options are: 688 * ``best`` 689 * ``generic`` 690 * ``prims_kdtree`` 691 * ``prims_balltree`` 692 * ``boruvka_kdtree`` 693 * ``boruvka_balltree`` 694 695 leaf_size: int, optional (default=40) 696 If using a space tree algorithm (kdtree, or balltree) the number 697 of points ina leaf node of the tree. This does not alter the 698 resulting clustering, but may have an effect on the runtime 699 of the algorithm. 700 701 memory : Instance of joblib.Memory or string (optional) 702 Used to cache the output of the computation of the tree. 703 By default, no caching is done. If a string is given, it is the 704 path to the caching directory. 705 706 approx_min_span_tree : bool, optional (default=True) 707 Whether to accept an only approximate minimum spanning tree. 708 For some algorithms this can provide a significant speedup, but 709 the resulting clustering may be of marginally lower quality. 710 If you are willing to sacrifice speed for correctness you may want 711 to explore this; in general this should be left at the default True. 712 713 gen_min_span_tree: bool, optional (default=False) 714 Whether to generate the minimum spanning tree with regard 715 to mutual reachability distance for later analysis. 716 717 core_dist_n_jobs : int, optional (default=4) 718 Number of parallel jobs to run in core distance computations (if 719 supported by the specific algorithm). For ``core_dist_n_jobs`` 720 below -1, (n_cpus + 1 + core_dist_n_jobs) are used. 721 722 cluster_selection_method : string, optional (default='eom') 723 The method used to select clusters from the condensed tree. The 724 standard approach for HDBSCAN* is to use an Excess of Mass algorithm 725 to find the most persistent clusters. Alternatively you can instead 726 select the clusters at the leaves of the tree -- this provides the 727 most fine grained and homogeneous clusters. Options are: 728 * ``eom`` 729 * ``leaf`` 730 731 allow_single_cluster : bool, optional (default=False) 732 By default HDBSCAN* will not produce a single cluster, setting this 733 to True will override this and allow single cluster results in 734 the case that you feel this is a valid result for your dataset. 735 736 prediction_data : boolean, optional 737 Whether to generate extra cached data for predicting labels or 738 membership vectors few new unseen points later. If you wish to 739 persist the clustering object for later re-use you probably want 740 to set this to True. 741 (default False) 742 743 match_reference_implementation : bool, optional (default=False) 744 There exist some interpretational differences between this 745 HDBSCAN* implementation and the original authors reference 746 implementation in Java. This can result in very minor differences 747 in clustering results. Setting this flag to True will, at a some 748 performance cost, ensure that the clustering results match the 749 reference implementation. 750 751 **kwargs : optional 752 Arguments passed to the distance metric 753 754 Attributes 755 ---------- 756 labels_ : ndarray, shape (n_samples, ) 757 Cluster labels for each point in the dataset given to fit(). 758 Noisy samples are given the label -1. 759 760 probabilities_ : ndarray, shape (n_samples, ) 761 The strength with which each sample is a member of its assigned 762 cluster. Noise points have probability zero; points in clusters 763 have values assigned proportional to the degree that they 764 persist as part of the cluster. 765 766 cluster_persistence_ : ndarray, shape (n_clusters, ) 767 A score of how persistent each cluster is. A score of 1.0 represents 768 a perfectly stable cluster that persists over all distance scales, 769 while a score of 0.0 represents a perfectly ephemeral cluster. These 770 scores can be guage the relative coherence of the clusters output 771 by the algorithm. 772 773 condensed_tree_ : CondensedTree object 774 The condensed tree produced by HDBSCAN. The object has methods 775 for converting to pandas, networkx, and plotting. 776 777 single_linkage_tree_ : SingleLinkageTree object 778 The single linkage tree produced by HDBSCAN. The object has methods 779 for converting to pandas, networkx, and plotting. 780 781 minimum_spanning_tree_ : MinimumSpanningTree object 782 The minimum spanning tree of the mutual reachability graph generated 783 by HDBSCAN. Note that this is not generated by default and will only 784 be available if `gen_min_span_tree` was set to True on object creation. 785 Even then in some optimized cases a tre may not be generated. 786 787 outlier_scores_ : ndarray, shape (n_samples, ) 788 Outlier scores for clustered points; the larger the score the more 789 outlier-like the point. Useful as an outlier detection technique. 790 Based on the GLOSH algorithm by Campello, Moulavi, Zimek and Sander. 791 792 prediction_data_ : PredictionData object 793 Cached data used for predicting the cluster labels of new or 794 unseen points. Necessary only if you are using functions from 795 ``hdbscan.prediction`` (see 796 :func:`~hdbscan.prediction.approximate_predict`, 797 :func:`~hdbscan.prediction.membership_vector`, 798 and :func:`~hdbscan.prediction.all_points_membership_vectors`). 799 800 exemplars_ : list 801 A list of exemplar points for clusters. Since HDBSCAN supports 802 arbitrary shapes for clusters we cannot provide a single cluster 803 exemplar per cluster. Instead a list is returned with each element 804 of the list being a numpy array of exemplar points for a cluster -- 805 these points are the "most representative" points of the cluster. 806 807 relative_validity_ : float 808 A fast approximation of the Density Based Cluster Validity (DBCV) 809 score [4]. The only differece, and the speed, comes from the fact 810 that this relative_validity_ is computed using the mutual- 811 reachability minimum spanning tree, i.e. minimum_spanning_tree_, 812 instead of the all-points minimum spanning tree used in the 813 reference. This score might not be an objective measure of the 814 goodness of clusterering. It may only be used to compare results 815 across different choices of hyper-parameters, therefore is only a 816 relative score. 817 818 References 819 ---------- 820 821 .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). 822 Density-based clustering based on hierarchical density estimates. 823 In Pacific-Asia Conference on Knowledge Discovery and Data Mining 824 (pp. 160-172). Springer Berlin Heidelberg. 825 826 .. [2] Campello, R. J., Moulavi, D., Zimek, A., & Sander, J. (2015). 827 Hierarchical density estimates for data clustering, visualization, 828 and outlier detection. ACM Transactions on Knowledge Discovery 829 from Data (TKDD), 10(1), 5. 830 831 .. [3] Chaudhuri, K., & Dasgupta, S. (2010). Rates of convergence for the 832 cluster tree. In Advances in Neural Information Processing Systems 833 (pp. 343-351). 834 835 .. [4] Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and 836 Sander, J., 2014. Density-Based Clustering Validation. In SDM 837 (pp. 839-847). 838 839 .. [5] Malzer, C., & Baum, M. (2019). A Hybrid Approach To Hierarchical 840 Density-based Cluster Selection. arxiv preprint 1911.02282. 841 842 """ 843 844 def __init__(self, min_cluster_size=5, min_samples=None, cluster_selection_epsilon=0.0, 845 metric='euclidean', alpha=1.0, p=None, 846 algorithm='best', leaf_size=40, 847 memory=Memory(cachedir=None, verbose=0), 848 approx_min_span_tree=True, 849 gen_min_span_tree=False, 850 core_dist_n_jobs=4, 851 cluster_selection_method='eom', 852 allow_single_cluster=False, 853 prediction_data=False, 854 match_reference_implementation=False, **kwargs): 855 self.min_cluster_size = min_cluster_size 856 self.min_samples = min_samples 857 self.alpha = alpha 858 self.cluster_selection_epsilon = cluster_selection_epsilon 859 self.metric = metric 860 self.p = p 861 self.algorithm = algorithm 862 self.leaf_size = leaf_size 863 self.memory = memory 864 self.approx_min_span_tree = approx_min_span_tree 865 self.gen_min_span_tree = gen_min_span_tree 866 self.core_dist_n_jobs = core_dist_n_jobs 867 self.cluster_selection_method = cluster_selection_method 868 self.allow_single_cluster = allow_single_cluster 869 self.match_reference_implementation = match_reference_implementation 870 self.prediction_data = prediction_data 871 872 self._metric_kwargs = kwargs 873 874 self._condensed_tree = None 875 self._single_linkage_tree = None 876 self._min_spanning_tree = None 877 self._raw_data = None 878 self._outlier_scores = None 879 self._prediction_data = None 880 self._relative_validity = None 881 882 def fit(self, X, y=None): 883 """Perform HDBSCAN clustering from features or distance matrix. 884 885 Parameters 886 ---------- 887 X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ 888 array of shape (n_samples, n_samples) 889 A feature array, or array of distances between samples if 890 ``metric='precomputed'``. 891 892 Returns 893 ------- 894 self : object 895 Returns self 896 """ 897 if self.metric != 'precomputed': 898 X = check_array(X, accept_sparse='csr') 899 self._raw_data = X 900 elif issparse(X): 901 # Handle sparse precomputed distance matrices separately 902 X = check_array(X, accept_sparse='csr') 903 else: 904 # Only non-sparse, precomputed distance matrices are allowed 905 # to have numpy.inf values indicating missing distances 906 check_precomputed_distance_matrix(X) 907 908 kwargs = self.get_params() 909 # prediction data only applies to the persistent model, so remove 910 # it from the keyword args we pass on the the function 911 kwargs.pop('prediction_data', None) 912 kwargs.update(self._metric_kwargs) 913 914 (self.labels_, 915 self.probabilities_, 916 self.cluster_persistence_, 917 self._condensed_tree, 918 self._single_linkage_tree, 919 self._min_spanning_tree) = hdbscan(X, **kwargs) 920 921 if self.prediction_data: 922 self.generate_prediction_data() 923 924 return self 925 926 def fit_predict(self, X, y=None): 927 """Performs clustering on X and returns cluster labels. 928 929 Parameters 930 ---------- 931 X : array or sparse (CSR) matrix of shape (n_samples, n_features), or \ 932 array of shape (n_samples, n_samples) 933 A feature array, or array of distances between samples if 934 ``metric='precomputed'``. 935 936 Returns 937 ------- 938 y : ndarray, shape (n_samples, ) 939 cluster labels 940 """ 941 self.fit(X) 942 return self.labels_ 943 944 def generate_prediction_data(self): 945 """ 946 Create data that caches intermediate results used for predicting 947 the label of new/unseen points. This data is only useful if 948 you are intending to use functions from ``hdbscan.prediction``. 949 """ 950 951 if self.metric in FAST_METRICS: 952 min_samples = self.min_samples or self.min_cluster_size 953 if self.metric in KDTree.valid_metrics: 954 tree_type = 'kdtree' 955 elif self.metric in BallTree.valid_metrics: 956 tree_type = 'balltree' 957 else: 958 warn('Metric {} not supported for prediction data!'.format(self.metric)) 959 return 960 961 self._prediction_data = PredictionData( 962 self._raw_data, self.condensed_tree_, min_samples, 963 tree_type=tree_type, metric=self.metric, 964 **self._metric_kwargs 965 ) 966 else: 967 warn('Cannot generate prediction data for non-vector' 968 'space inputs -- access to the source data rather' 969 'than mere distances is required!') 970 971 972 def weighted_cluster_centroid(self, cluster_id): 973 """Provide an approximate representative point for a given cluster. 974 Note that this technique assumes a euclidean metric for speed of 975 computation. For more general metrics use the ``weighted_cluster_medoid`` 976 method which is slower, but can work with the metric the model trained 977 with. 978 979 Parameters 980 ---------- 981 cluster_id: int 982 The id of the cluster to compute a centroid for. 983 984 Returns 985 ------- 986 centroid: array of shape (n_features,) 987 A representative centroid for cluster ``cluster_id``. 988 """ 989 if not hasattr(self, 'labels_'): 990 raise AttributeError('Model has not been fit to data') 991 992 if cluster_id == -1: 993 raise ValueError('Cannot calculate weighted centroid for -1 cluster ' 994 'since it is a noise cluster') 995 996 mask = self.labels_ == cluster_id 997 cluster_data = self._raw_data[mask] 998 cluster_membership_strengths = self.probabilities_[mask] 999 1000 return np.average(cluster_data, weights=cluster_membership_strengths, axis=0) 1001 1002 1003 def weighted_cluster_medoid(self, cluster_id): 1004 """Provide an approximate representative point for a given cluster. 1005 Note that this technique can be very slow and memory intensive for 1006 large clusters. For faster results use the ``weighted_cluster_centroid`` 1007 method which is faster, but assumes a euclidean metric. 1008 1009 Parameters 1010 ---------- 1011 cluster_id: int 1012 The id of the cluster to compute a medoid for. 1013 1014 Returns 1015 ------- 1016 centroid: array of shape (n_features,) 1017 A representative medoid for cluster ``cluster_id``. 1018 """ 1019 if not hasattr(self, 'labels_'): 1020 raise AttributeError('Model has not been fit to data') 1021 1022 if cluster_id == -1: 1023 raise ValueError('Cannot calculate weighted centroid for -1 cluster ' 1024 'since it is a noise cluster') 1025 1026 mask = self.labels_ == cluster_id 1027 cluster_data = self._raw_data[mask] 1028 cluster_membership_strengths = self.probabilities_[mask] 1029 1030 dist_mat = pairwise_distances(cluster_data, 1031 metric=self.metric, 1032 **self._metric_kwargs) 1033 1034 dist_mat = dist_mat * cluster_membership_strengths 1035 medoid_index = np.argmin(dist_mat.sum(axis=1)) 1036 return cluster_data[medoid_index] 1037 1038 1039 @property 1040 def prediction_data_(self): 1041 if self._prediction_data is None: 1042 raise AttributeError('No prediction data was generated') 1043 else: 1044 return self._prediction_data 1045 1046 @property 1047 def outlier_scores_(self): 1048 if self._outlier_scores is not None: 1049 return self._outlier_scores 1050 else: 1051 if self._condensed_tree is not None: 1052 self._outlier_scores = outlier_scores(self._condensed_tree) 1053 return self._outlier_scores 1054 else: 1055 raise AttributeError('No condensed tree was generated; try running fit first.') 1056 1057 @property 1058 def condensed_tree_(self): 1059 if self._condensed_tree is not None: 1060 return CondensedTree(self._condensed_tree, 1061 self.cluster_selection_method, 1062 self.allow_single_cluster) 1063 else: 1064 raise AttributeError('No condensed tree was generated; try running fit first.') 1065 1066 @property 1067 def single_linkage_tree_(self): 1068 if self._single_linkage_tree is not None: 1069 return SingleLinkageTree(self._single_linkage_tree) 1070 else: 1071 raise AttributeError('No single linkage tree was generated; try running fit' 1072 ' first.') 1073 1074 @property 1075 def minimum_spanning_tree_(self): 1076 if self._min_spanning_tree is not None: 1077 if self._raw_data is not None: 1078 return MinimumSpanningTree(self._min_spanning_tree, 1079 self._raw_data) 1080 else: 1081 warn('No raw data is available; this may be due to using' 1082 ' a precomputed metric matrix. No minimum spanning' 1083 ' tree will be provided without raw data.') 1084 return None 1085 else: 1086 raise AttributeError('No minimum spanning tree was generated.' 1087 'This may be due to optimized algorithm variations that skip' 1088 ' explicit generation of the spanning tree.') 1089 1090 @property 1091 def exemplars_(self): 1092 if self._prediction_data is not None: 1093 return self._prediction_data.exemplars 1094 elif self.metric in FAST_METRICS: 1095 self.generate_prediction_data() 1096 return self._prediction_data.exemplars 1097 else: 1098 raise AttributeError('Currently exemplars require the use of vector input data' 1099 'with a suitable metric. This will likely change in the ' 1100 'future, but for now no exemplars can be provided') 1101 1102 @property 1103 def relative_validity_(self): 1104 if self._relative_validity is not None: 1105 return self._relative_validity 1106 1107 if not self.gen_min_span_tree: 1108 raise AttributeError("Minimum spanning tree not present. " + 1109 "Either HDBSCAN object was created with " + 1110 "gen_min_span_tree=False or the tree was " + 1111 "not generated in spite of it owing to " + 1112 "internal optimization criteria.") 1113 return 1114 1115 labels = self.labels_ 1116 sizes = np.bincount(labels + 1) 1117 noise_size = sizes[0] 1118 cluster_size = sizes[1:] 1119 total = noise_size + np.sum(cluster_size) 1120 num_clusters = len(cluster_size) 1121 DSC = np.zeros(num_clusters) 1122 min_outlier_sep = np.inf # only required if num_clusters = 1 1123 correction_const = 2 # only required if num_clusters = 1 1124 1125 # Unltimately, for each Ci, we only require the 1126 # minimum of DSPC(Ci, Cj) over all Cj != Ci. 1127 # So let's call this value DSPC_wrt(Ci), i.e. 1128 # density separation 'with respect to' Ci. 1129 DSPC_wrt = np.ones(num_clusters) * np.inf 1130 max_distance = 0 1131 1132 mst_df = self.minimum_spanning_tree_.to_pandas() 1133 1134 for edge in mst_df.iterrows(): 1135 label1 = labels[int(edge[1]['from'])] 1136 label2 = labels[int(edge[1]['to'])] 1137 length = edge[1]['distance'] 1138 1139 max_distance = max(max_distance, length) 1140 1141 if label1 == -1 and label2 == -1: 1142 continue 1143 elif label1 == -1 or label2 == -1: 1144 # If exactly one of the points is noise 1145 min_outlier_sep = min(min_outlier_sep, length) 1146 continue 1147 1148 if label1 == label2: 1149 # Set the density sparseness of the cluster 1150 # to the sparsest value seen so far. 1151 DSC[label1] = max(length, DSC[label1]) 1152 else: 1153 # Check whether density separations with 1154 # respect to each of these clusters can 1155 # be reduced. 1156 DSPC_wrt[label1] = min(length, DSPC_wrt[label1]) 1157 DSPC_wrt[label2] = min(length, DSPC_wrt[label2]) 1158 1159 # In case min_outlier_sep is still np.inf, we assign a new value to it. 1160 # This only makes sense if num_clusters = 1 since it has turned out 1161 # that the MR-MST has no edges between a noise point and a core point. 1162 min_outlier_sep = max_distance if min_outlier_sep == np.inf else min_outlier_sep 1163 1164 # DSPC_wrt[Ci] might be infinite if the connected component for Ci is 1165 # an "island" in the MR-MST. Whereas for other clusters Cj and Ck, the 1166 # MR-MST might contain an edge with one point in Cj and ther other one 1167 # in Ck. Here, we replace the infinite density separation of Ci by 1168 # another large enough value. 1169 # 1170 # TODO: Think of a better yet efficient way to handle this. 1171 correction = correction_const * (max_distance if num_clusters > 1 else min_outlier_sep) 1172 DSPC_wrt[np.where(DSPC_wrt == np.inf)] = correction 1173 1174 V_index = [(DSPC_wrt[i] - DSC[i]) / max(DSPC_wrt[i], DSC[i]) for i in range(num_clusters)] 1175 score = np.sum([(cluster_size[i] * V_index[i]) / total for i in range(num_clusters)]) 1176 self._relative_validity = score 1177 return self._relative_validity 1178