1import numpy as np 2from sklearn.metrics import pairwise_distances 3from scipy.spatial.distance import cdist 4from ._hdbscan_linkage import mst_linkage_core 5from .hdbscan_ import isclose 6 7def all_points_core_distance(distance_matrix, d=2.0): 8 """ 9 Compute the all-points-core-distance for all the points of a cluster. 10 11 Parameters 12 ---------- 13 distance_matrix : array (cluster_size, cluster_size) 14 The pairwise distance matrix between points in the cluster. 15 16 d : integer 17 The dimension of the data set, which is used in the computation 18 of the all-point-core-distance as per the paper. 19 20 Returns 21 ------- 22 core_distances : array (cluster_size,) 23 The all-points-core-distance of each point in the cluster 24 25 References 26 ---------- 27 Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., 28 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 29 """ 30 distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[ 31 distance_matrix != 0]) ** d 32 result = distance_matrix.sum(axis=1) 33 result /= distance_matrix.shape[0] - 1 34 result **= (-1.0 / d) 35 36 return result 37 38 39def all_points_mutual_reachability(X, labels, cluster_id, 40 metric='euclidean', d=None, **kwd_args): 41 """ 42 Compute the all-points-mutual-reachability distances for all the points of 43 a cluster. 44 45 If metric is 'precomputed' then assume X is a distance matrix for the full 46 dataset. Note that in this case you must pass in 'd' the dimension of the 47 dataset. 48 49 Parameters 50 ---------- 51 X : array (n_samples, n_features) or (n_samples, n_samples) 52 The input data of the clustering. This can be the data, or, if 53 metric is set to `precomputed` the pairwise distance matrix used 54 for the clustering. 55 56 labels : array (n_samples) 57 The label array output by the clustering, providing an integral 58 cluster label to each data point, with -1 for noise points. 59 60 cluster_id : integer 61 The cluster label for which to compute the all-points 62 mutual-reachability (which should be done on a cluster 63 by cluster basis). 64 65 metric : string 66 The metric used to compute distances for the clustering (and 67 to be re-used in computing distances for mr distance). If 68 set to `precomputed` then X is assumed to be the precomputed 69 distance matrix between samples. 70 71 d : integer (or None) 72 The number of features (dimension) of the dataset. This need only 73 be set in the case of metric being set to `precomputed`, where 74 the ambient dimension of the data is unknown to the function. 75 76 **kwd_args : 77 Extra arguments to pass to the distance computation for other 78 metrics, such as minkowski, Mahanalobis etc. 79 80 Returns 81 ------- 82 83 mutual_reachaibility : array (n_samples, n_samples) 84 The pairwise mutual reachability distances between all points in `X` 85 with `label` equal to `cluster_id`. 86 87 core_distances : array (n_samples,) 88 The all-points-core_distance of all points in `X` with `label` equal 89 to `cluster_id`. 90 91 References 92 ---------- 93 Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., 94 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 95 """ 96 if metric == 'precomputed': 97 if d is None: 98 raise ValueError('If metric is precomputed a ' 99 'd value must be provided!') 100 distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id] 101 else: 102 subset_X = X[labels == cluster_id, :] 103 distance_matrix = pairwise_distances(subset_X, metric=metric, 104 **kwd_args) 105 d = X.shape[1] 106 107 core_distances = all_points_core_distance(distance_matrix.copy(), d=d) 108 core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1)) 109 110 result = np.dstack( 111 [distance_matrix, core_dist_matrix, core_dist_matrix.T]).max(axis=-1) 112 113 return result, core_distances 114 115 116def internal_minimum_spanning_tree(mr_distances): 117 """ 118 Compute the 'internal' minimum spanning tree given a matrix of mutual 119 reachability distances. Given a minimum spanning tree the 'internal' 120 graph is the subgraph induced by vertices of degree greater than one. 121 122 Parameters 123 ---------- 124 mr_distances : array (cluster_size, cluster_size) 125 The pairwise mutual reachability distances, inferred to be the edge 126 weights of a complete graph. Since MSTs are computed per cluster 127 this is the all-points-mutual-reacability for points within a single 128 cluster. 129 130 Returns 131 ------- 132 internal_nodes : array 133 An array listing the indices of the internal nodes of the MST 134 135 internal_edges : array (?, 3) 136 An array of internal edges in weighted edge list format; that is 137 an edge is an array of length three listing the two vertices 138 forming the edge and weight of the edge. 139 140 References 141 ---------- 142 Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., 143 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 144 """ 145 single_linkage_data = mst_linkage_core(mr_distances) 146 min_span_tree = single_linkage_data.copy() 147 for index, row in enumerate(min_span_tree[1:], 1): 148 candidates = np.where(isclose(mr_distances[int(row[1])], row[2]))[0] 149 candidates = np.intersect1d(candidates, 150 single_linkage_data[:index, :2].astype( 151 int)) 152 candidates = candidates[candidates != row[1]] 153 assert len(candidates) > 0 154 row[0] = candidates[0] 155 156 vertices = np.arange(mr_distances.shape[0])[ 157 np.bincount(min_span_tree.T[:2].flatten().astype(np.intp)) > 1] 158 # A little "fancy" we select from the flattened array reshape back 159 # (Fortran format to get indexing right) and take the product to do an and 160 # then convert back to boolean type. 161 edge_selection = np.prod(np.in1d(min_span_tree.T[:2], vertices).reshape( 162 (min_span_tree.shape[0], 2), order='F'), axis=1).astype(bool) 163 164 # Density sparseness is not well defined if there are no 165 # internal edges (as per the referenced paper). However 166 # MATLAB code from the original authors simply selects the 167 # largest of *all* the edges in the case that there are 168 # no internal edges, so we do the same here 169 if np.any(edge_selection): 170 # If there are any internal edges, then subselect them out 171 edges = min_span_tree[edge_selection] 172 else: 173 # If there are no internal edges then we want to take the 174 # max over all the edges that exist in the MST, so we simply 175 # do nothing and return all the edges in the MST. 176 edges = min_span_tree.copy() 177 178 return vertices, edges 179 180 181def density_separation(X, labels, cluster_id1, cluster_id2, 182 internal_nodes1, internal_nodes2, 183 core_distances1, core_distances2, 184 metric='euclidean', **kwd_args): 185 """ 186 Compute the density separation between two clusters. This is the minimum 187 all-points mutual reachability distance between pairs of points, one from 188 internal nodes of MSTs of each cluster. 189 190 Parameters 191 ---------- 192 X : array (n_samples, n_features) or (n_samples, n_samples) 193 The input data of the clustering. This can be the data, or, if 194 metric is set to `precomputed` the pairwise distance matrix used 195 for the clustering. 196 197 labels : array (n_samples) 198 The label array output by the clustering, providing an integral 199 cluster label to each data point, with -1 for noise points. 200 201 cluster_id1 : integer 202 The first cluster label to compute separation between. 203 204 cluster_id2 : integer 205 The second cluster label to compute separation between. 206 207 internal_nodes1 : array 208 The vertices of the MST for `cluster_id1` that were internal vertices. 209 210 internal_nodes2 : array 211 The vertices of the MST for `cluster_id2` that were internal vertices. 212 213 core_distances1 : array (size of cluster_id1,) 214 The all-points-core_distances of all points in the cluster 215 specified by cluster_id1. 216 217 core_distances2 : array (size of cluster_id2,) 218 The all-points-core_distances of all points in the cluster 219 specified by cluster_id2. 220 221 metric : string 222 The metric used to compute distances for the clustering (and 223 to be re-used in computing distances for mr distance). If 224 set to `precomputed` then X is assumed to be the precomputed 225 distance matrix between samples. 226 227 **kwd_args : 228 Extra arguments to pass to the distance computation for other 229 metrics, such as minkowski, Mahanalobis etc. 230 231 Returns 232 ------- 233 The 'density separation' between the clusters specified by 234 `cluster_id1` and `cluster_id2`. 235 236 References 237 ---------- 238 Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., 239 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 240 """ 241 if metric == 'precomputed': 242 sub_select = X[labels == cluster_id1, :][:, labels == cluster_id2] 243 distance_matrix = sub_select[internal_nodes1, :][:, internal_nodes2] 244 else: 245 cluster1 = X[labels == cluster_id1][internal_nodes1] 246 cluster2 = X[labels == cluster_id2][internal_nodes2] 247 distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args) 248 249 core_dist_matrix1 = np.tile(core_distances1[internal_nodes1], 250 (distance_matrix.shape[1], 1)).T 251 core_dist_matrix2 = np.tile(core_distances2[internal_nodes2], 252 (distance_matrix.shape[0], 1)) 253 254 mr_dist_matrix = np.dstack([distance_matrix, 255 core_dist_matrix1, 256 core_dist_matrix2]).max(axis=-1) 257 258 return mr_dist_matrix.min() 259 260 261def validity_index(X, labels, metric='euclidean', 262 d=None, per_cluster_scores=False, **kwd_args): 263 """ 264 Compute the density based cluster validity index for the 265 clustering specified by `labels` and for each cluster in `labels`. 266 267 Parameters 268 ---------- 269 X : array (n_samples, n_features) or (n_samples, n_samples) 270 The input data of the clustering. This can be the data, or, if 271 metric is set to `precomputed` the pairwise distance matrix used 272 for the clustering. 273 274 labels : array (n_samples) 275 The label array output by the clustering, providing an integral 276 cluster label to each data point, with -1 for noise points. 277 278 metric : optional, string (default 'euclidean') 279 The metric used to compute distances for the clustering (and 280 to be re-used in computing distances for mr distance). If 281 set to `precomputed` then X is assumed to be the precomputed 282 distance matrix between samples. 283 284 d : optional, integer (or None) (default None) 285 The number of features (dimension) of the dataset. This need only 286 be set in the case of metric being set to `precomputed`, where 287 the ambient dimension of the data is unknown to the function. 288 289 per_cluster_scores : optional, boolean (default False) 290 Whether to return the validity index for individual clusters. 291 Defaults to False with the function returning a single float 292 value for the whole clustering. 293 294 **kwd_args : 295 Extra arguments to pass to the distance computation for other 296 metrics, such as minkowski, Mahanalobis etc. 297 298 Returns 299 ------- 300 validity_index : float 301 The density based cluster validity index for the clustering. This 302 is a numeric value between -1 and 1, with higher values indicating 303 a 'better' clustering. 304 305 per_cluster_validity_index : array (n_clusters,) 306 The cluster validity index of each individual cluster as an array. 307 The overall validity index is the weighted average of these values. 308 Only returned if per_cluster_scores is set to True. 309 310 References 311 ---------- 312 Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J., 313 2014. Density-Based Clustering Validation. In SDM (pp. 839-847). 314 """ 315 core_distances = {} 316 density_sparseness = {} 317 mst_nodes = {} 318 mst_edges = {} 319 320 max_cluster_id = labels.max() + 1 321 density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id), 322 dtype=np.float64) 323 cluster_validity_indices = np.empty(max_cluster_id, dtype=np.float64) 324 325 for cluster_id in range(max_cluster_id): 326 327 if np.sum(labels == cluster_id) == 0: 328 continue 329 330 mr_distances, core_distances[ 331 cluster_id] = all_points_mutual_reachability( 332 X, 333 labels, 334 cluster_id, 335 metric, 336 d, 337 **kwd_args 338 ) 339 340 mst_nodes[cluster_id], mst_edges[cluster_id] = \ 341 internal_minimum_spanning_tree(mr_distances) 342 density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max() 343 344 for i in range(max_cluster_id): 345 346 if np.sum(labels == i) == 0: 347 continue 348 349 internal_nodes_i = mst_nodes[i] 350 for j in range(i + 1, max_cluster_id): 351 352 if np.sum(labels == j) == 0: 353 continue 354 355 internal_nodes_j = mst_nodes[j] 356 density_sep[i, j] = density_separation( 357 X, labels, i, j, 358 internal_nodes_i, internal_nodes_j, 359 core_distances[i], core_distances[j], 360 metric=metric, **kwd_args 361 ) 362 density_sep[j, i] = density_sep[i, j] 363 364 n_samples = float(X.shape[0]) 365 result = 0 366 367 for i in range(max_cluster_id): 368 369 if np.sum(labels == i) == 0: 370 continue 371 372 min_density_sep = density_sep[i].min() 373 cluster_validity_indices[i] = ( 374 (min_density_sep - density_sparseness[i]) / 375 max(min_density_sep, density_sparseness[i]) 376 ) 377 cluster_size = np.sum(labels == i) 378 result += (cluster_size / n_samples) * cluster_validity_indices[i] 379 380 if per_cluster_scores: 381 return result, cluster_validity_indices 382 else: 383 return result 384