1# cython: boundscheck=False 2# cython: nonecheck=False 3# cython: initializedcheck=False 4# mutual reachability distance compiutations 5# Authors: Leland McInnes 6# License: 3-clause BSD 7 8import numpy as np 9cimport numpy as np 10 11from scipy.spatial.distance import pdist, squareform 12from scipy.sparse import lil_matrix as sparse_matrix 13from sklearn.neighbors import KDTree, BallTree 14import gc 15 16 17def mutual_reachability(distance_matrix, min_points=5, alpha=1.0): 18 """Compute the weighted adjacency matrix of the mutual reachability 19 graph of a distance matrix. 20 21 Parameters 22 ---------- 23 distance_matrix : ndarray, shape (n_samples, n_samples) 24 Array of distances between samples. 25 26 min_points : int, optional (default=5) 27 The number of points in a neighbourhood for a point to be considered 28 a core point. 29 30 Returns 31 ------- 32 mututal_reachability: ndarray, shape (n_samples, n_samples) 33 Weighted adjacency matrix of the mutual reachability graph. 34 35 References 36 ---------- 37 .. [1] Campello, R. J., Moulavi, D., & Sander, J. (2013, April). 38 Density-based clustering based on hierarchical density estimates. 39 In Pacific-Asia Conference on Knowledge Discovery and Data Mining 40 (pp. 160-172). Springer Berlin Heidelberg. 41 """ 42 size = distance_matrix.shape[0] 43 min_points = min(size - 1, min_points) 44 try: 45 core_distances = np.partition(distance_matrix, 46 min_points, 47 axis=0)[min_points] 48 except AttributeError: 49 core_distances = np.sort(distance_matrix, 50 axis=0)[min_points] 51 52 if alpha != 1.0: 53 distance_matrix = distance_matrix / alpha 54 55 stage1 = np.where(core_distances > distance_matrix, 56 core_distances, distance_matrix) 57 result = np.where(core_distances > stage1.T, 58 core_distances.T, stage1.T).T 59 return result 60 61 62cpdef sparse_mutual_reachability(object lil_matrix, np.intp_t min_points=5, 63 float alpha=1.0, float max_dist=0.): 64 65 cdef np.intp_t i 66 cdef np.intp_t j 67 cdef np.intp_t n 68 cdef np.double_t mr_dist 69 cdef list sorted_row_data 70 cdef np.ndarray[dtype=np.double_t, ndim=1] core_distance 71 cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_row_data 72 cdef np.ndarray[dtype=np.int32_t, ndim=1] nz_col_data 73 74 result = sparse_matrix(lil_matrix.shape) 75 core_distance = np.empty(lil_matrix.shape[0], dtype=np.double) 76 77 for i in range(lil_matrix.shape[0]): 78 sorted_row_data = sorted(lil_matrix.data[i]) 79 if min_points < len(sorted_row_data): 80 core_distance[i] = sorted_row_data[min_points] 81 else: 82 core_distance[i] = np.infty 83 84 if alpha != 1.0: 85 lil_matrix = lil_matrix / alpha 86 87 nz_row_data, nz_col_data = lil_matrix.nonzero() 88 89 for n in range(nz_row_data.shape[0]): 90 i = nz_row_data[n] 91 j = nz_col_data[n] 92 93 mr_dist = max(core_distance[i], core_distance[j], lil_matrix[i, j]) 94 if np.isfinite(mr_dist): 95 result[i, j] = mr_dist 96 elif max_dist > 0: 97 result[i, j] = max_dist 98 99 return result.tocsr() 100 101 102def kdtree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, 103 alpha=1.0, **kwargs): 104 dim = distance_matrix.shape[0] 105 min_points = min(dim - 1, min_points) 106 107 if metric == 'minkowski': 108 tree = KDTree(X, metric=metric, p=p) 109 else: 110 tree = KDTree(X, metric=metric, **kwargs) 111 112 core_distances = tree.query(X, k=min_points)[0][:, -1] 113 114 if alpha != 1.0: 115 distance_matrix = distance_matrix / alpha 116 117 stage1 = np.where(core_distances > distance_matrix, 118 core_distances, distance_matrix) 119 result = np.where(core_distances > stage1.T, 120 core_distances.T, stage1.T).T 121 return result 122 123 124def balltree_mutual_reachability(X, distance_matrix, metric, p=2, min_points=5, 125 alpha=1.0, **kwargs): 126 dim = distance_matrix.shape[0] 127 min_points = min(dim - 1, min_points) 128 129 tree = BallTree(X, metric=metric, **kwargs) 130 131 core_distances = tree.query(X, k=min_points)[0][:, -1] 132 133 if alpha != 1.0: 134 distance_matrix = distance_matrix / alpha 135 136 stage1 = np.where(core_distances > distance_matrix, 137 core_distances, distance_matrix) 138 result = np.where(core_distances > stage1.T, 139 core_distances.T, stage1.T).T 140 return result 141 142 143cdef np.ndarray[np.double_t, ndim=1] mutual_reachability_from_pdist( 144 np.ndarray[np.double_t, ndim=1] core_distances, 145 np.ndarray[np.double_t, ndim=1] dists, np.intp_t dim): 146 147 cdef np.intp_t i 148 cdef np.intp_t j 149 cdef np.intp_t result_pos 150 151 result_pos = 0 152 for i in range(dim): 153 for j in range(i + 1, dim): 154 if core_distances[i] > core_distances[j]: 155 if core_distances[i] > dists[result_pos]: 156 dists[result_pos] = core_distances[i] 157 158 else: 159 if core_distances[j] > dists[result_pos]: 160 dists[result_pos] = core_distances[j] 161 162 result_pos += 1 163 164 return dists 165 166 167def kdtree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, 168 **kwargs): 169 170 dim = X.shape[0] 171 min_points = min(dim - 1, min_points) 172 173 if metric == 'minkowski': 174 tree = KDTree(X, metric=metric, p=p) 175 else: 176 tree = KDTree(X, metric=metric, **kwargs) 177 178 core_distances = tree.query(X, k=min_points)[0][:, -1] 179 180 del tree 181 gc.collect() 182 183 dists = pdist(X, metric=metric, p=p, **kwargs) 184 185 if alpha != 1.0: 186 dists /= alpha 187 188 dists = mutual_reachability_from_pdist(core_distances, dists, dim) 189 190 return dists 191 192 193def balltree_pdist_mutual_reachability(X, metric, p=2, min_points=5, alpha=1.0, 194 **kwargs): 195 196 dim = X.shape[0] 197 min_points = min(dim - 1, min_points) 198 199 tree = BallTree(X, metric=metric, **kwargs) 200 201 core_distances = tree.query(X, k=min_points)[0][:, -1] 202 203 del tree 204 gc.collect() 205 206 dists = pdist(X, metric=metric, p=p, **kwargs) 207 208 if alpha != 1.0: 209 dists /= alpha 210 211 dists = mutual_reachability_from_pdist(core_distances, dists, dim) 212 213 return dists 214