1#cython: boundscheck=False, nonecheck=False, initializedcheck=False
2# Utility routines in cython for prediction in hdbscan
3# Authors: Leland McInnes
4# License: 3-clause BSD
5
6import numpy as np
7cimport numpy as np
8
9from dist_metrics cimport DistanceMetric
10
11from libc.float cimport DBL_MAX
12from libc.math cimport exp
13
cpdef get_tree_row_with_child(np.ndarray tree, np.intp_t child):
    """Return the first row of the condensed ``tree`` whose 'child' field
    equals ``child``; fall back to the first row when no match exists.
    """
    cdef np.intp_t row
    cdef np.ndarray[np.intp_t, ndim = 1] children = tree['child']

    for row in range(tree.shape[0]):
        if children[row] == child:
            return tree[row]

    # No matching child found: return the first row as a default.
    return tree[0]
24
cdef np.float64_t min_dist_to_exemplar(
                        np.ndarray[np.float64_t, ndim=1] point,
                        np.ndarray[np.float64_t, ndim=2] cluster_exemplars,
                        DistanceMetric dist_metric):
    """Return the smallest distance from ``point`` to any row of
    ``cluster_exemplars`` under ``dist_metric``.
    """
    cdef np.intp_t i
    cdef np.float64_t result = DBL_MAX
    cdef np.float64_t distance
    # Raw pointer into the point's buffer for the C-level dist() call.
    cdef np.float64_t *point_ptr = (<np.float64_t *> point.data)
    # Typed C-contiguous view over the exemplar matrix.  NOTE(review): the
    # cast assumes the array is C-contiguous -- it would misread strided
    # data; confirm callers always pass contiguous exemplar arrays.
    cdef np.float64_t[:, ::1] exemplars_view = \
        (<np.float64_t [:cluster_exemplars.shape[0], :cluster_exemplars.shape[1]:1]>
            (<np.float64_t *> cluster_exemplars.data))
    cdef np.float64_t *exemplars_ptr = \
        (<np.float64_t *> &exemplars_view[0, 0])
    cdef np.intp_t num_features = point.shape[0]

    # Scan every exemplar row, keeping the minimum distance seen so far.
    for i in range(cluster_exemplars.shape[0]):
        distance = dist_metric.dist(point_ptr,
                                    &exemplars_ptr[num_features * i],
                                    num_features)
        if distance < result:
            result = distance

    return result
49
cdef np.ndarray[np.float64_t, ndim=1] dist_vector(
                    np.ndarray[np.float64_t, ndim=1] point,
                    list exemplars_list,
                    DistanceMetric dist_metric):
    """Distance from ``point`` to the nearest exemplar of each cluster.

    Returns one entry per cluster, in the order of ``exemplars_list``.
    """
    cdef np.intp_t cluster_idx
    cdef np.intp_t n_clusters = len(exemplars_list)
    cdef np.ndarray[np.float64_t, ndim=2] cluster_exemplars
    cdef np.ndarray[np.float64_t, ndim=1] distances = np.empty(n_clusters)

    for cluster_idx in range(n_clusters):
        cluster_exemplars = exemplars_list[cluster_idx]
        distances[cluster_idx] = min_dist_to_exemplar(point, cluster_exemplars,
                                                      dist_metric)

    return distances
65
cpdef np.ndarray[np.float64_t, ndim=1] dist_membership_vector(
                    np.ndarray[np.float64_t, ndim=1] point,
                    list exemplars_list,
                    DistanceMetric dist_metric,
                    softmax=False):
    """Membership vector for ``point`` based on distance to cluster exemplars.

    Parameters
    ----------
    point : 1d float64 array
        The feature vector to score.
    exemplars_list : list of 2d float64 arrays
        One exemplar matrix per cluster (rows are exemplar points).
    dist_metric : DistanceMetric
        Metric used for point-to-exemplar distances.
    softmax : bool, optional (default False)
        If True, inverse distances are passed through a softmax;
        otherwise they are normalized directly to sum to one.

    Returns
    -------
    1d float64 array
        One membership score per cluster, summing to one.
    """
    cdef np.intp_t i
    cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(len(exemplars_list))
    cdef np.ndarray[np.float64_t, ndim=1] vector
    cdef np.float64_t total = 0.0

    vector = dist_vector(point, exemplars_list, dist_metric)

    if softmax:
        for i in range(vector.shape[0]):
            if vector[i] != 0:
                result[i] = 1.0 / vector[i]
            else:
                # Guard against division by zero when the point coincides
                # with an exemplar (the non-softmax branch already guarded
                # this); DBL_MAX makes that cluster dominate the softmax.
                result[i] = DBL_MAX
        # Subtract the max before exponentiating for numerical stability;
        # the shift cancels in the normalization below.
        result = np.exp(result - np.nanmax(result))
        total = np.sum(result)
    else:
        for i in range(vector.shape[0]):
            if vector[i] != 0:
                result[i] = 1.0 / vector[i]
            else:
                # Split DBL_MAX across clusters so the normalizing sum
                # cannot overflow.
                result[i] = DBL_MAX / vector.shape[0]
            total += result[i]

    for i in range(result.shape[0]):
        result[i] = result[i] / total

    return result
97
cpdef np.ndarray[np.float64_t, ndim=2] all_points_dist_membership_vector(
        np.ndarray[np.float64_t, ndim=2] all_points,
        list exemplars_list,
        DistanceMetric dist_metric,
        softmax=False):
    """Distance-based membership vectors for every row of ``all_points``.

    Returns an array of shape (n_points, n_clusters) where each row is the
    ``dist_membership_vector`` of the corresponding point.
    """
    cdef np.intp_t row
    cdef np.ndarray[np.float64_t, ndim=2] memberships = np.empty(
        (all_points.shape[0], len(exemplars_list)), dtype=np.float64)

    for row in range(all_points.shape[0]):
        memberships[row] = dist_membership_vector(all_points[row],
                                                  exemplars_list,
                                                  dist_metric,
                                                  softmax)

    return memberships
117
cdef np.ndarray[np.float64_t, ndim=1] merge_height(
        np.intp_t point_cluster,
        np.float64_t point_lambda,
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray cluster_tree):
    """Lambda value at which a point in ``point_cluster`` (at
    ``point_lambda``) merges with each cluster in ``clusters``.

    For each target cluster, both ``point_cluster`` and the target are
    walked up the condensed tree until they meet.  If both sides had to
    take at least one parent step, the merge height is the lambda at which
    the last-moved branch joined the common ancestor; if one cluster is an
    ancestor of the other, the point's own lambda is used for that entry.
    """

    cdef np.intp_t i
    cdef np.intp_t j

    cdef np.intp_t left_cluster
    cdef np.intp_t right_cluster
    cdef int took_right_parent
    cdef int took_left_parent
    cdef np.intp_t cluster

    cdef np.ndarray[np.float64_t, ndim=1] result = np.empty(clusters.shape[0],
                                                            dtype=np.float64)
    cdef np.ndarray[np.intp_t, ndim=1] parents
    cdef np.ndarray[np.intp_t, ndim=1] children
    cdef np.ndarray[np.float64_t, ndim=1] lambdas

    # convert the cluster tree for fast direct access
    parents = cluster_tree['parent'].astype(np.intp)
    children = cluster_tree['child'].astype(np.intp)
    lambdas = cluster_tree['lambda_val'].astype(np.float64)


    for i in range(clusters.shape[0]):

        took_right_parent = False
        took_left_parent = False

        right_cluster = clusters[i]
        left_cluster = point_cluster

        # Walk the deeper side up one parent at a time until both sides
        # land on the same node.  NOTE(review): this relies on child labels
        # being strictly greater than their parents' in the condensed tree,
        # so the larger id is always the deeper node -- confirm against the
        # tree builder.
        while left_cluster != right_cluster:
            if left_cluster > right_cluster:
                took_left_parent = True
                # Remember the node we are stepping away from; its join
                # lambda is the merge height when both sides end up moving.
                last_cluster = left_cluster

                # Set left_cluster to be its parent
                for j in range(children.shape[0]):
                    if children[j] == left_cluster:
                        left_cluster = parents[j]
                        break
            else:
                took_right_parent = True
                last_cluster = right_cluster

                # Set right_cluster to be its parent
                for j in range(children.shape[0]):
                    if children[j] == right_cluster:
                        right_cluster = parents[j]
                        break

        if took_left_parent and took_right_parent:
            # Take the lambda value of last_cluster merging in
            for j in range(children.shape[0]):
                if children[j] == last_cluster:
                    result[i] = lambdas[j]
                    break
        else:
            # One cluster is an ancestor of the other: the point separates
            # from the target at its own lambda.
            result[i] = point_lambda

    return result
183
cpdef np.ndarray[np.float64_t, ndim=1] per_cluster_scores(
        np.intp_t neighbor,
        np.float32_t lambda_,
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Per-cluster outlier scores for the point ``neighbor``.

    Each entry j is ``max_lambda / (max_lambda - merge_height_j)`` for the
    cluster's merge height with the neighbor's cluster; the caller is
    expected to exponentiate/normalize these as needed.
    """
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t idx

    cdef np.ndarray[np.float64_t, ndim=1] scores

    neighbor_row = get_tree_row_with_child(tree, neighbor)
    point_cluster = neighbor_row['parent']
    point_lambda = lambda_
    max_lambda = max_lambda_dict[point_cluster] + 1e-8 # avoid zero

    # Reuse the array returned by merge_height as the score buffer so we
    # avoid a second allocation.
    scores = merge_height(point_cluster, point_lambda,
                          clusters, cluster_tree)

    for idx in range(scores.shape[0]):
        scores[idx] = (max_lambda / (max_lambda - scores[idx]))

    return scores
217
cpdef np.ndarray[np.float64_t, ndim=1] outlier_membership_vector(neighbor,
            lambda_, clusters, tree, max_lambda_dict, cluster_tree,
            softmax=True):
    """Normalized membership vector for an outlier point ``neighbor``.

    Computes the per-cluster scores and, when ``softmax`` is true, passes
    them through a numerically-stable softmax; either way the result is
    scaled to sum to one.
    """
    cdef np.ndarray[np.float64_t, ndim=1] result

    result = per_cluster_scores(neighbor, lambda_, clusters, tree,
                                max_lambda_dict, cluster_tree)

    if softmax:
        # Shifting by the max before exponentiating is mathematically
        # equivalent after the normalization below, but numerically stable.
        result = np.exp(result - np.nanmax(result))

    result /= result.sum()
    return result
237
cpdef np.float64_t prob_in_some_cluster(neighbor, lambda_, clusters, tree,
            max_lambda_dict, cluster_tree):
    """Probability estimate that ``neighbor`` belongs to any cluster at all,
    based on its highest merge height relative to that cluster's max lambda.
    """
    cdef np.ndarray[np.float64_t, ndim=1] cluster_merge_heights

    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    neighbor_row = get_tree_row_with_child(tree, neighbor)
    point_cluster = neighbor_row['parent']
    point_lambda = lambda_

    cluster_merge_heights = merge_height(point_cluster, point_lambda,
                                         clusters, cluster_tree)
    nearest_cluster = clusters[cluster_merge_heights.argmax()]
    point_height = cluster_merge_heights.max()

    # Epsilon avoids a zero denominator when both lambdas are zero.
    max_lambda = max(lambda_, max_lambda_dict[nearest_cluster]) + 1e-8

    return (point_height / max_lambda)
259
cpdef np.ndarray[np.float64_t, ndim=2] all_points_per_cluster_scores(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Per-cluster outlier scores for every point in the condensed tree.

    Returns an array of shape (num_points, num_clusters) where entry
    (p, j) is ``exp(-(max_lambda / merge_height))`` for point p and
    cluster ``clusters[j]``.
    """

    # NOTE(review): assumes points are labelled 0..n-1 and clusters n.. in
    # the condensed tree, so the smallest parent id equals the number of
    # points -- confirm against the tree builder.
    cdef np.intp_t num_points = tree['parent'].min()
    cdef np.ndarray[np.float64_t, ndim=2] result_arr
    cdef np.float64_t[:, ::1] result
    cdef np.intp_t point
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t i, j

    result_arr = np.empty((num_points, clusters.shape[0]), dtype=np.float64)
    # Zero-copy typed view over result_arr's buffer (result_arr is freshly
    # allocated by np.empty, hence C-contiguous).
    result = (<np.float64_t [:num_points, :clusters.shape[0]:1]>
                 (<np.float64_t *> result_arr.data))

    # Rows with child_size == 1 correspond to individual points.
    point_tree = tree[tree['child_size'] == 1]

    for i in range(point_tree.shape[0]):
        point_row = point_tree[i]
        point = point_row['child']
        point_cluster = point_row['parent']
        point_lambda = point_row['lambda_val']
        max_lambda = max_lambda_dict[point_cluster] + 1e-8 # avoid zero lambda

        # Can we not do a faster merge height operation here?
        result_arr[point] = merge_height(point_cluster, point_lambda,
                                          clusters, cluster_tree)

        # Cythonize: result = np.exp(-(max_lambda / height))
        # (result and result_arr share the same buffer.)
        for j in range(result_arr.shape[1]):
            result[point][j] = exp(-(max_lambda / result[point][j]))

    return result_arr
298
cpdef np.ndarray[np.float64_t, ndim=2] all_points_outlier_membership_vector(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree,
        np.intp_t softmax=True):
    """Row-normalized outlier membership vectors for all points.

    When ``softmax`` is true the per-cluster scores are exponentiated
    (shifted by the global max for stability) before each row is scaled
    to sum to one.
    """
    cdef np.ndarray[np.float64_t, ndim=2] scores

    scores = all_points_per_cluster_scores(clusters,
                                           tree,
                                           max_lambda_dict,
                                           cluster_tree)
    if softmax:
        # Shifting by the max is mathematically equivalent after the
        # row normalization below, but numerically stable.
        result = np.exp(scores - np.nanmax(scores))
    else:
        result = scores

    return result / result.sum(axis=1)[:, np.newaxis]
325
cpdef all_points_prob_in_some_cluster(
        np.ndarray[np.intp_t, ndim=1] clusters,
        np.ndarray tree,
        dict max_lambda_dict,
        np.ndarray cluster_tree):
    """Probability that each point belongs to some cluster, for all points.

    For every point, the probability is its maximum merge height divided
    by the larger of the nearest cluster's max lambda and the point's own
    lambda.  Returns a 1d float64 array indexed by point id.
    """
    cdef np.ndarray[np.float64_t, ndim=1] heights
    cdef np.intp_t num_points = tree['parent'].min()
    cdef np.ndarray[np.float64_t, ndim=1] result
    cdef np.intp_t point
    cdef np.intp_t point_cluster
    cdef np.float64_t point_lambda
    cdef np.float64_t max_lambda

    cdef np.intp_t i

    result = np.empty(num_points, dtype=np.float64)

    # Rows with child_size == 1 correspond to individual points.
    point_tree = tree[tree['child_size'] == 1]

    for i in range(point_tree.shape[0]):
        point_row = point_tree[i]
        point = point_row['child']
        point_cluster = point_row['parent']
        point_lambda = point_row['lambda_val']

        # Can we not do a faster merge height operation here?
        heights = merge_height(point_cluster, point_lambda,
                               clusters, cluster_tree)
        # Epsilon guards against a zero denominator when both the cluster's
        # max lambda and the point's lambda are zero, mirroring
        # prob_in_some_cluster above.
        max_lambda = max(max_lambda_dict[clusters[heights.argmax()]],
                         point_lambda) + 1e-8
        result[point] = (heights.max() / max_lambda)

    return result
360