#cython: boundscheck=False, nonecheck=False, initializedcheck=False
# Utility routines in cython for prediction in hdbscan
# Authors: Leland McInnes
# License: 3-clause BSD

import numpy as np
cimport numpy as np

from dist_metrics cimport DistanceMetric

from libc.float cimport DBL_MAX
from libc.math cimport exp


cpdef get_tree_row_with_child(np.ndarray tree, np.intp_t child):
    """Return the first row of the structured array ``tree`` whose
    ``'child'`` field equals ``child``.

    Falls back to ``tree[0]`` when no row matches, so callers always
    receive a row (they must not rely on a match having been found).
    """
    cdef np.intp_t i
    cdef np.ndarray[np.intp_t, ndim = 1] child_array = tree['child']

    # Linear scan over the 'child' column; first hit wins.
    for i in range(tree.shape[0]):
        if child_array[i] == child:
            return tree[i]

    return tree[0]


cdef np.float64_t min_dist_to_exemplar(
        np.ndarray[np.float64_t, ndim=1] point,
        np.ndarray[np.float64_t, ndim=2] cluster_exemplars,
        DistanceMetric dist_metric):
    """Return the minimum distance from ``point`` to any row of
    ``cluster_exemplars`` under ``dist_metric``.

    Works on raw C pointers so ``dist_metric.dist`` can be called
    without per-row Python overhead.  Assumes both arrays are
    C-contiguous float64 (the memoryview cast below asserts row
    contiguity for the exemplar matrix) — TODO confirm callers
    guarantee this.
    """
    cdef np.intp_t i
    cdef np.float64_t result = DBL_MAX
    cdef np.float64_t distance
    cdef np.float64_t *point_ptr = (<np.float64_t *> point.data)
    # Reinterpret the exemplar buffer as a typed 2D memoryview purely to
    # obtain a flat base pointer for pointer arithmetic below.
    cdef np.float64_t[:, ::1] exemplars_view = \
        (<np.float64_t [:cluster_exemplars.shape[0], :cluster_exemplars.shape[1]:1]>
         (<np.float64_t *> cluster_exemplars.data))
    cdef np.float64_t *exemplars_ptr = \
        (<np.float64_t *> &exemplars_view[0, 0])
    cdef np.intp_t num_features = point.shape[0]

    # Track the running minimum over all exemplar rows.
    for i in range(cluster_exemplars.shape[0]):
        distance = dist_metric.dist(point_ptr,
                                    &exemplars_ptr[num_features * i],
                                    num_features)
        if distance < result:
            result = distance

    return result


cdef np.ndarray[np.float64_t, ndim=1] dist_vector(
        np.ndarray[np.float64_t, ndim=1] point,
        list exemplars_list,
        DistanceMetric dist_metric):
    """Return, for each cluster in ``exemplars_list``, the minimum
    distance from ``point`` to that cluster's exemplar points.

    ``exemplars_list`` is a list of 2D float64 arrays, one per cluster;
    the result has one entry per cluster, in list order.
    """
    cdef np.intp_t i
    cdef np.ndarray[np.float64_t, ndim=2] exemplars
    cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list))

    for i in range(len(exemplars_list)):
        exemplars = exemplars_list[i]
        result[i] = min_dist_to_exemplar(point, exemplars, dist_metric)

    return result


cpdef np.ndarray[np.float64_t, ndim=1] dist_membership_vector(
        np.ndarray[np.float64_t, ndim=1] point,
        list exemplars_list,
        DistanceMetric dist_metric,
        softmax=False):
    """Return a membership vector for ``point`` over the clusters in
    ``exemplars_list``, based on inverse distance to cluster exemplars.

    With ``softmax=False`` the inverse distances are normalized to sum
    to one (zero distance maps to a large finite value, DBL_MAX split
    across clusters, to avoid inf).  With ``softmax=True`` a numerically
    stabilized softmax of the inverse distances is used instead.

    NOTE(review): the softmax branch does not guard ``vector[i] == 0``
    the way the else branch does, so an exact zero distance would
    produce inf before the exp — presumably zero distances are not
    expected on this path; confirm against callers.
    """
    cdef np.intp_t i
    cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list))
    cdef np.ndarray[np.float64_t, ndim=1] vector
    cdef np.float64_t sum = 0.0

    vector = dist_vector(point, exemplars_list, dist_metric)

    if softmax:
        for i in range(vector.shape[0]):
            result[i] = 1.0 / vector[i]
        # Subtract the max before exponentiating for numerical
        # stability; the final normalization cancels the shift.
        result = np.exp(result - np.nanmax(result))
        sum = np.sum(result)

    else:
        for i in range(vector.shape[0]):
            if vector[i] != 0:
                result[i] = 1.0 / vector[i]
            else:
                # Exact hit on an exemplar: give this cluster a huge
                # (but finite) weight rather than dividing by zero.
                result[i] = DBL_MAX / vector.shape[0]
            sum += result[i]

    # Normalize so the memberships sum to one.
    for i in range(result.shape[0]):
        result[i] = result[i] / sum

    return result


cpdef np.ndarray[np.float64_t, ndim=2] all_points_dist_membership_vector(
        np.ndarray[np.float64_t, ndim=2] all_points,
        list exemplars_list,
        DistanceMetric dist_metric,
        softmax=False):
    """Return a (n_points, n_clusters) matrix of distance-based
    membership vectors: row i is ``dist_membership_vector`` applied to
    ``all_points[i]``.
    """
    cdef np.ndarray[np.float64_t, ndim=2] result
    cdef np.intp_t i

    result = np.empty((all_points.shape[0], len(exemplars_list)),
                      dtype=np.float64)

    for i in range(all_points.shape[0]):
        result[i] = dist_membership_vector(all_points[i],
                                           exemplars_list,
                                           dist_metric,
                                           softmax)

    return result


cdef np.ndarray[np.float64_t, ndim=1] merge_height(
        np.intp_t point_cluster,
        np.float64_t point_lambda,
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray cluster_tree):
    """For each cluster id in ``clusters``, return the lambda value at
    which that cluster and ``point_cluster`` merge in ``cluster_tree``.

    For each target cluster the two ids are walked up toward their
    common ancestor, one parent step at a time, always advancing the
    larger id.  NOTE(review): this relies on child ids being strictly
    larger than their parent ids in the condensed cluster tree — TODO
    confirm that invariant holds for the trees passed in.

    If only one side had to move (i.e. one cluster is an ancestor of
    the other), ``point_lambda`` itself is returned for that entry;
    otherwise the lambda at which the last-moved cluster merged into
    the common ancestor is used.
    """
    cdef np.intp_t i
    cdef np.intp_t j

    cdef np.intp_t left_cluster
    cdef np.intp_t right_cluster
    cdef int took_right_parent
    cdef int took_left_parent
    cdef np.intp_t cluster

    cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(clusters.shape[0],
                                                            dtype=np.float64)
    cdef np.ndarray[np.intp_t, ndim=1] parents
    cdef np.ndarray[np.intp_t, ndim=1] children
    cdef np.ndarray[np.float64_t, ndim=1] lambdas

    # Convert the cluster tree's columns to plain typed arrays for fast
    # direct access inside the loops below.
    parents = cluster_tree['parent'].astype(np.intp)
    children = cluster_tree['child'].astype(np.intp)
    lambdas = cluster_tree['lambda_val'].astype(np.float64)

    for i in range(clusters.shape[0]):

        took_right_parent = False
        took_left_parent = False

        right_cluster = clusters[i]
        left_cluster = point_cluster

        # Walk both ids up the tree until they meet; always advance the
        # larger id (assumed deeper — see NOTE in the docstring).
        while left_cluster != right_cluster:
            if left_cluster > right_cluster:
                took_left_parent = True
                last_cluster = left_cluster

                # Set left_cluster to be its parent
                for j in range(children.shape[0]):
                    if children[j] == left_cluster:
                        left_cluster = parents[j]
                        break
            else:
                took_right_parent = True
                last_cluster = right_cluster

                # Set right_cluster to be its parent
                for j in range(children.shape[0]):
                    if children[j] == right_cluster:
                        right_cluster = parents[j]
                        break

        if took_left_parent and took_right_parent:
            # Both sides moved: take the lambda value at which
            # last_cluster merged into the common ancestor.
            for j in range(children.shape[0]):
                if children[j] == last_cluster:
                    result[i] = lambdas[j]
                    break
        else:
            # One cluster is an ancestor of the other: the merge height
            # is the point's own lambda.
            result[i] = point_lambda

    return result


cpdef np.ndarray[np.float64_t, ndim=1] per_cluster_scores(
        np.intp_t neighbor,
        np.float32_t lambda_,
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Return an (unnormalized) per-cluster outlier score vector for the
    point identified by ``neighbor``, based on the merge heights between
    its cluster and each cluster in ``clusters``.

    NOTE(review): the score here is the rational form
    ``max_lambda / (max_lambda - height)`` while
    ``all_points_per_cluster_scores`` below uses
    ``exp(-(max_lambda / height))`` — both keep the other formula as a
    commented-out alternative, so the divergence may be deliberate;
    confirm before unifying.
    """
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t i

    cdef np.ndarray[np.float64_t, ndim=1] result

    point_row = get_tree_row_with_child(tree, neighbor)
    point_cluster = point_row['parent']
    point_lambda = lambda_
    max_lambda = max_lambda_dict[point_cluster] + 1e-8  # avoid zero lambda

    # Save an allocation by assigning and reusing result ...
    # height = merge_height(point_cluster, point_lambda,
    #                       clusters, cluster_tree)
    result = merge_height(point_cluster, point_lambda,
                          clusters, cluster_tree)

    # Cythonize: result = np.exp(-(max_lambda / height))
    for i in range(result.shape[0]):
        # result[i] = exp(-(max_lambda / result[i]))
        result[i] = (max_lambda / (max_lambda - result[i]))

    return result


cpdef np.ndarray[np.float64_t, ndim=1] outlier_membership_vector(neighbor,
        lambda_, clusters, tree, max_lambda_dict, cluster_tree,
        softmax=True):
    """Return a normalized membership vector (summing to one) for the
    point ``neighbor`` over ``clusters``, derived from
    ``per_cluster_scores``.

    With ``softmax=True`` the scores are passed through a max-shifted
    exp before normalization; otherwise the raw scores are normalized
    directly.
    """
    cdef np.ndarray[np.float64_t, ndim=1] result

    if softmax:
        result = per_cluster_scores(neighbor, lambda_, clusters, tree,
                                    max_lambda_dict, cluster_tree)
        # Scale for numerical stability, mathematically equivalent with old
        # version due to the scaling with the sum in below.
        result = np.exp(result - np.nanmax(result))
        #result[~np.isfinite(result)] = np.finfo(np.double).max
    else:
        result = per_cluster_scores(neighbor, lambda_, clusters, tree,
                                    max_lambda_dict, cluster_tree)

    result /= result.sum()
    return result


cpdef np.float64_t prob_in_some_cluster(neighbor, lambda_, clusters, tree,
        max_lambda_dict, cluster_tree):
    """Return the probability that the point ``neighbor`` belongs to its
    nearest cluster: the largest merge height divided by that cluster's
    maximum lambda (or the point's own lambda, whichever is larger).
    """
    cdef np.ndarray[np.float64_t, ndim=1] cluster_merge_heights

    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    point_row = get_tree_row_with_child(tree, neighbor)
    point_cluster = point_row['parent']
    point_lambda = lambda_

    cluster_merge_heights = \
        merge_height(point_cluster, point_lambda, clusters, cluster_tree)
    # The nearest cluster is the one this point merges with highest
    # (i.e. at the largest lambda).
    point_height = cluster_merge_heights.max()
    nearest_cluster = clusters[cluster_merge_heights.argmax()]

    max_lambda = max(lambda_, max_lambda_dict[nearest_cluster]) + 1e-8  # avoid zero

    return (point_height / max_lambda)


cpdef np.ndarray[np.float64_t, ndim=2] all_points_per_cluster_scores(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Return a (n_points, n_clusters) matrix of unnormalized
    exp-based outlier scores, one row per original data point in
    ``tree``.

    NOTE(review): ``tree['parent'].min()`` is used as the number of
    points — presumably the smallest cluster id in a condensed tree
    equals the point count; confirm that invariant.
    """
    cdef np.intp_t num_points = tree['parent'].min()
    cdef np.ndarray[np.float64_t, ndim=2] result_arr
    cdef np.float64_t[:, ::1] result
    cdef np.intp_t point
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t i, j

    result_arr = np.empty((num_points, clusters.shape[0]), dtype=np.float64)
    # Typed memoryview over the same buffer for fast element writes.
    result = (<np.float64_t [:num_points, :clusters.shape[0]:1]>
              (<np.float64_t *> result_arr.data))

    # Rows with child_size == 1 are the leaf (single point) entries.
    point_tree = tree[tree['child_size'] == 1]

    for i in range(point_tree.shape[0]):
        point_row = point_tree[i]
        point = point_row['child']
        point_cluster = point_row['parent']
        point_lambda = point_row['lambda_val']
        max_lambda = max_lambda_dict[point_cluster] + 1e-8  # avoid zero lambda

        # Can we not do a faster merge height operation here?
        result_arr[point] = merge_height(point_cluster, point_lambda,
                                         clusters, cluster_tree)

        # Cythonize: result = np.exp(-(max_lambda / height))
        for j in range(result_arr.shape[1]):
            result[point][j] = exp(-(max_lambda / result[point][j]))

    return result_arr


cpdef np.ndarray[np.float64_t, ndim=2] all_points_outlier_membership_vector(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree,
        np.intp_t softmax=True):
    """Return a (n_points, n_clusters) matrix of membership vectors,
    each row normalized to sum to one; the vectorized counterpart of
    ``outlier_membership_vector``.
    """
    cdef np.ndarray[np.float64_t, ndim=2] per_cluster_scores

    per_cluster_scores = all_points_per_cluster_scores(
        clusters,
        tree,
        max_lambda_dict,
        cluster_tree)
    if softmax:
        # Scale for numerical stability, mathematically equivalent with old
        # version due to the scaling with the sum in below.
        result = np.exp(per_cluster_scores - np.nanmax(per_cluster_scores))
        #result[~np.isfinite(result)] = np.finfo(np.double).max
    else:
        result = per_cluster_scores

    # Normalize each row to sum to one.
    row_sums = result.sum(axis=1)
    result = result / row_sums[:, np.newaxis]

    return result


cpdef all_points_prob_in_some_cluster(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Return, for every original data point, the probability that it
    belongs to its nearest cluster; the vectorized counterpart of
    ``prob_in_some_cluster``.
    """
    cdef np.ndarray[np.float64_t, ndim=1] heights
    cdef np.intp_t num_points = tree['parent'].min()
    cdef np.ndarray[np.float64_t, ndim=1] result
    cdef np.intp_t point
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t i

    result = np.empty(num_points, dtype=np.float64)

    # Rows with child_size == 1 are the leaf (single point) entries.
    point_tree = tree[tree['child_size'] == 1]

    for i in range(point_tree.shape[0]):
        point_row = point_tree[i]
        point = point_row['child']
        point_cluster = point_row['parent']
        point_lambda = point_row['lambda_val']

        # Can we not do a faster merge height operation here?
        heights = merge_height(point_cluster, point_lambda,
                               clusters, cluster_tree)
        max_lambda = max(max_lambda_dict[clusters[heights.argmax()]],
                         point_lambda)
        result[point] = (heights.max() / max_lambda)

    return result