1import numpy as np
2from sklearn.metrics import pairwise_distances
3from scipy.spatial.distance import cdist
4from ._hdbscan_linkage import mst_linkage_core
5from .hdbscan_ import isclose
6
7def all_points_core_distance(distance_matrix, d=2.0):
8    """
9    Compute the all-points-core-distance for all the points of a cluster.
10
11    Parameters
12    ----------
13    distance_matrix : array (cluster_size, cluster_size)
14        The pairwise distance matrix between points in the cluster.
15
16    d : integer
17        The dimension of the data set, which is used in the computation
18        of the all-point-core-distance as per the paper.
19
20    Returns
21    -------
22    core_distances : array (cluster_size,)
23        The all-points-core-distance of each point in the cluster
24
25    References
26    ----------
27    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
28    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
29    """
30    distance_matrix[distance_matrix != 0] = (1.0 / distance_matrix[
31        distance_matrix != 0]) ** d
32    result = distance_matrix.sum(axis=1)
33    result /= distance_matrix.shape[0] - 1
34    result **= (-1.0 / d)
35
36    return result
37
38
39def all_points_mutual_reachability(X, labels, cluster_id,
40                                   metric='euclidean', d=None, **kwd_args):
41    """
42    Compute the all-points-mutual-reachability distances for all the points of
43    a cluster.
44
45    If metric is 'precomputed' then assume X is a distance matrix for the full
46    dataset. Note that in this case you must pass in 'd' the dimension of the
47    dataset.
48
49    Parameters
50    ----------
51    X : array (n_samples, n_features) or (n_samples, n_samples)
52        The input data of the clustering. This can be the data, or, if
53        metric is set to `precomputed` the pairwise distance matrix used
54        for the clustering.
55
56    labels : array (n_samples)
57        The label array output by the clustering, providing an integral
58        cluster label to each data point, with -1 for noise points.
59
60    cluster_id : integer
61        The cluster label for which to compute the all-points
62        mutual-reachability (which should be done on a cluster
63        by cluster basis).
64
65    metric : string
66        The metric used to compute distances for the clustering (and
67        to be re-used in computing distances for mr distance). If
68        set to `precomputed` then X is assumed to be the precomputed
69        distance matrix between samples.
70
71    d : integer (or None)
72        The number of features (dimension) of the dataset. This need only
73        be set in the case of metric being set to `precomputed`, where
74        the ambient dimension of the data is unknown to the function.
75
76    **kwd_args :
77        Extra arguments to pass to the distance computation for other
78        metrics, such as minkowski, Mahanalobis etc.
79
80    Returns
81    -------
82
83    mutual_reachaibility : array (n_samples, n_samples)
84        The pairwise mutual reachability distances between all points in `X`
85        with `label` equal to `cluster_id`.
86
87    core_distances : array (n_samples,)
88        The all-points-core_distance of all points in `X` with `label` equal
89        to `cluster_id`.
90
91    References
92    ----------
93    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
94    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
95    """
96    if metric == 'precomputed':
97        if d is None:
98            raise ValueError('If metric is precomputed a '
99                             'd value must be provided!')
100        distance_matrix = X[labels == cluster_id, :][:, labels == cluster_id]
101    else:
102        subset_X = X[labels == cluster_id, :]
103        distance_matrix = pairwise_distances(subset_X, metric=metric,
104                                             **kwd_args)
105        d = X.shape[1]
106
107    core_distances = all_points_core_distance(distance_matrix.copy(), d=d)
108    core_dist_matrix = np.tile(core_distances, (core_distances.shape[0], 1))
109
110    result = np.dstack(
111        [distance_matrix, core_dist_matrix, core_dist_matrix.T]).max(axis=-1)
112
113    return result, core_distances
114
115
116def internal_minimum_spanning_tree(mr_distances):
117    """
118    Compute the 'internal' minimum spanning tree given a matrix of mutual
119    reachability distances. Given a minimum spanning tree the 'internal'
120    graph is the subgraph induced by vertices of degree greater than one.
121
122    Parameters
123    ----------
124    mr_distances : array (cluster_size, cluster_size)
125        The pairwise mutual reachability distances, inferred to be the edge
126        weights of a complete graph. Since MSTs are computed per cluster
127        this is the all-points-mutual-reacability for points within a single
128        cluster.
129
130    Returns
131    -------
132    internal_nodes : array
133        An array listing the indices of the internal nodes of the MST
134
135    internal_edges : array (?, 3)
136        An array of internal edges in weighted edge list format; that is
137        an edge is an array of length three listing the two vertices
138        forming the edge and weight of the edge.
139
140    References
141    ----------
142    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
143    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
144    """
145    single_linkage_data = mst_linkage_core(mr_distances)
146    min_span_tree = single_linkage_data.copy()
147    for index, row in enumerate(min_span_tree[1:], 1):
148        candidates = np.where(isclose(mr_distances[int(row[1])], row[2]))[0]
149        candidates = np.intersect1d(candidates,
150                                    single_linkage_data[:index, :2].astype(
151                                        int))
152        candidates = candidates[candidates != row[1]]
153        assert len(candidates) > 0
154        row[0] = candidates[0]
155
156    vertices = np.arange(mr_distances.shape[0])[
157        np.bincount(min_span_tree.T[:2].flatten().astype(np.intp)) > 1]
158    # A little "fancy" we select from the flattened array reshape back
159    # (Fortran format to get indexing right) and take the product to do an and
160    # then convert back to boolean type.
161    edge_selection = np.prod(np.in1d(min_span_tree.T[:2], vertices).reshape(
162        (min_span_tree.shape[0], 2), order='F'), axis=1).astype(bool)
163
164    # Density sparseness is not well defined if there are no
165    # internal edges (as per the referenced paper). However
166    # MATLAB code from the original authors simply selects the
167    # largest of *all* the edges in the case that there are
168    # no internal edges, so we do the same here
169    if np.any(edge_selection):
170        # If there are any internal edges, then subselect them out
171        edges = min_span_tree[edge_selection]
172    else:
173        # If there are no internal edges then we want to take the
174        # max over all the edges that exist in the MST, so we simply
175        # do nothing and return all the edges in the MST.
176        edges = min_span_tree.copy()
177
178    return vertices, edges
179
180
181def density_separation(X, labels, cluster_id1, cluster_id2,
182                       internal_nodes1, internal_nodes2,
183                       core_distances1, core_distances2,
184                       metric='euclidean', **kwd_args):
185    """
186    Compute the density separation between two clusters. This is the minimum
187    all-points mutual reachability distance between pairs of points, one from
188    internal nodes of MSTs of each cluster.
189
190    Parameters
191    ----------
192    X : array (n_samples, n_features) or (n_samples, n_samples)
193        The input data of the clustering. This can be the data, or, if
194        metric is set to `precomputed` the pairwise distance matrix used
195        for the clustering.
196
197    labels : array (n_samples)
198        The label array output by the clustering, providing an integral
199        cluster label to each data point, with -1 for noise points.
200
201    cluster_id1 : integer
202        The first cluster label to compute separation between.
203
204    cluster_id2 : integer
205        The second cluster label to compute separation between.
206
207    internal_nodes1 : array
208        The vertices of the MST for `cluster_id1` that were internal vertices.
209
210    internal_nodes2 : array
211        The vertices of the MST for `cluster_id2` that were internal vertices.
212
213    core_distances1 : array (size of cluster_id1,)
214        The all-points-core_distances of all points in the cluster
215        specified by cluster_id1.
216
217    core_distances2 : array (size of cluster_id2,)
218        The all-points-core_distances of all points in the cluster
219        specified by cluster_id2.
220
221    metric : string
222        The metric used to compute distances for the clustering (and
223        to be re-used in computing distances for mr distance). If
224        set to `precomputed` then X is assumed to be the precomputed
225        distance matrix between samples.
226
227    **kwd_args :
228        Extra arguments to pass to the distance computation for other
229        metrics, such as minkowski, Mahanalobis etc.
230
231    Returns
232    -------
233    The 'density separation' between the clusters specified by
234    `cluster_id1` and `cluster_id2`.
235
236    References
237    ----------
238    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
239    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
240    """
241    if metric == 'precomputed':
242        sub_select = X[labels == cluster_id1, :][:, labels == cluster_id2]
243        distance_matrix = sub_select[internal_nodes1, :][:, internal_nodes2]
244    else:
245        cluster1 = X[labels == cluster_id1][internal_nodes1]
246        cluster2 = X[labels == cluster_id2][internal_nodes2]
247        distance_matrix = cdist(cluster1, cluster2, metric, **kwd_args)
248
249    core_dist_matrix1 = np.tile(core_distances1[internal_nodes1],
250                                (distance_matrix.shape[1], 1)).T
251    core_dist_matrix2 = np.tile(core_distances2[internal_nodes2],
252                                (distance_matrix.shape[0], 1))
253
254    mr_dist_matrix = np.dstack([distance_matrix,
255                                core_dist_matrix1,
256                                core_dist_matrix2]).max(axis=-1)
257
258    return mr_dist_matrix.min()
259
260
261def validity_index(X, labels, metric='euclidean',
262                   d=None, per_cluster_scores=False, **kwd_args):
263    """
264    Compute the density based cluster validity index for the
265    clustering specified by `labels` and for each cluster in `labels`.
266
267    Parameters
268    ----------
269    X : array (n_samples, n_features) or (n_samples, n_samples)
270        The input data of the clustering. This can be the data, or, if
271        metric is set to `precomputed` the pairwise distance matrix used
272        for the clustering.
273
274    labels : array (n_samples)
275        The label array output by the clustering, providing an integral
276        cluster label to each data point, with -1 for noise points.
277
278    metric : optional, string (default 'euclidean')
279        The metric used to compute distances for the clustering (and
280        to be re-used in computing distances for mr distance). If
281        set to `precomputed` then X is assumed to be the precomputed
282        distance matrix between samples.
283
284    d : optional, integer (or None) (default None)
285        The number of features (dimension) of the dataset. This need only
286        be set in the case of metric being set to `precomputed`, where
287        the ambient dimension of the data is unknown to the function.
288
289    per_cluster_scores : optional, boolean (default False)
290        Whether to return the validity index for individual clusters.
291        Defaults to False with the function returning a single float
292        value for the whole clustering.
293
294    **kwd_args :
295        Extra arguments to pass to the distance computation for other
296        metrics, such as minkowski, Mahanalobis etc.
297
298    Returns
299    -------
300    validity_index : float
301        The density based cluster validity index for the clustering. This
302        is a numeric value between -1 and 1, with higher values indicating
303        a 'better' clustering.
304
305    per_cluster_validity_index : array (n_clusters,)
306        The cluster validity index of each individual cluster as an array.
307        The overall validity index is the weighted average of these values.
308        Only returned if per_cluster_scores is set to True.
309
310    References
311    ----------
312    Moulavi, D., Jaskowiak, P.A., Campello, R.J., Zimek, A. and Sander, J.,
313    2014. Density-Based Clustering Validation. In SDM (pp. 839-847).
314    """
315    core_distances = {}
316    density_sparseness = {}
317    mst_nodes = {}
318    mst_edges = {}
319
320    max_cluster_id = labels.max() + 1
321    density_sep = np.inf * np.ones((max_cluster_id, max_cluster_id),
322                                   dtype=np.float64)
323    cluster_validity_indices = np.empty(max_cluster_id, dtype=np.float64)
324
325    for cluster_id in range(max_cluster_id):
326
327        if np.sum(labels == cluster_id) == 0:
328            continue
329
330        mr_distances, core_distances[
331            cluster_id] = all_points_mutual_reachability(
332            X,
333            labels,
334            cluster_id,
335            metric,
336            d,
337            **kwd_args
338        )
339
340        mst_nodes[cluster_id], mst_edges[cluster_id] = \
341            internal_minimum_spanning_tree(mr_distances)
342        density_sparseness[cluster_id] = mst_edges[cluster_id].T[2].max()
343
344    for i in range(max_cluster_id):
345
346        if np.sum(labels == i) == 0:
347            continue
348
349        internal_nodes_i = mst_nodes[i]
350        for j in range(i + 1, max_cluster_id):
351
352            if np.sum(labels == j) == 0:
353                continue
354
355            internal_nodes_j = mst_nodes[j]
356            density_sep[i, j] = density_separation(
357                X, labels, i, j,
358                internal_nodes_i, internal_nodes_j,
359                core_distances[i], core_distances[j],
360                metric=metric, **kwd_args
361            )
362            density_sep[j, i] = density_sep[i, j]
363
364    n_samples = float(X.shape[0])
365    result = 0
366
367    for i in range(max_cluster_id):
368
369        if np.sum(labels == i) == 0:
370            continue
371
372        min_density_sep = density_sep[i].min()
373        cluster_validity_indices[i] = (
374            (min_density_sep - density_sparseness[i]) /
375            max(min_density_sep, density_sparseness[i])
376        )
377        cluster_size = np.sum(labels == i)
378        result += (cluster_size / n_samples) * cluster_validity_indices[i]
379
380    if per_cluster_scores:
381        return result, cluster_validity_indices
382    else:
383        return result
384