1""" 2Cython rewrite of the vector quantization module, originally written 3in C at src/vq.c and the wrapper at src/vq_module.c. This should be 4easier to maintain than old SWIG output. 5 6Original C version by Damian Eads. 7Translated to Cython by David Warde-Farley, October 2009. 8""" 9 10cimport cython 11import numpy as np 12cimport numpy as np 13from scipy.linalg.cython_blas cimport dgemm, sgemm 14 15from libc.math cimport sqrt 16 17ctypedef np.float64_t float64_t 18ctypedef np.float32_t float32_t 19ctypedef np.int32_t int32_t 20 21# Use Cython fused types for templating 22# Define supported data types as vq_type 23ctypedef fused vq_type: 24 float32_t 25 float64_t 26 27# When the number of features is less than this number, 28# switch back to the naive algorithm to avoid high overhead. 29DEF NFEATURES_CUTOFF=5 30 31# Initialize the NumPy C API 32np.import_array() 33 34 35cdef inline vq_type vec_sqr(int n, vq_type *p): 36 cdef vq_type result = 0.0 37 cdef int i 38 for i in range(n): 39 result += p[i] * p[i] 40 return result 41 42 43cdef inline void cal_M(int nobs, int ncodes, int nfeat, vq_type *obs, 44 vq_type *code_book, vq_type *M): 45 """ 46 Calculate M = obs * code_book.T 47 """ 48 cdef vq_type alpha = -2.0, beta = 0.0 49 50 # Call BLAS functions with Fortran ABI 51 # Note that BLAS Fortran ABI uses column-major order 52 if vq_type is float32_t: 53 sgemm("T", "N", &ncodes, &nobs, &nfeat, 54 &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes) 55 else: 56 dgemm("T", "N", &ncodes, &nobs, &nfeat, 57 &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes) 58 59 60cdef int _vq(vq_type *obs, vq_type *code_book, 61 int ncodes, int nfeat, int nobs, 62 int32_t *codes, vq_type *low_dist) except -1: 63 """ 64 The underlying function (template) of _vq.vq. 65 66 Parameters 67 ---------- 68 obs : vq_type* 69 The pointer to the observation matrix. 70 code_book : vq_type* 71 The pointer to the code book matrix. 72 ncodes : int 73 The number of centroids (codes). 74 nfeat : int 75 The number of features of each observation. 76 nobs : int 77 The number of observations. 78 codes : int32_t* 79 The pointer to the new codes array. 80 low_dist : vq_type* 81 low_dist[i] is the Euclidean distance from obs[i] to the corresponding 82 centroid. 83 """ 84 # Naive algorithm is preferred when nfeat is small 85 if nfeat < NFEATURES_CUTOFF: 86 _vq_small_nf(obs, code_book, ncodes, nfeat, nobs, codes, low_dist) 87 return 0 88 89 cdef np.npy_intp i, j 90 cdef vq_type *p_obs 91 cdef vq_type *p_codes 92 cdef vq_type dist_sqr 93 cdef np.ndarray[vq_type, ndim=1] obs_sqr, codes_sqr 94 cdef np.ndarray[vq_type, ndim=2] M 95 96 if vq_type is float32_t: 97 obs_sqr = np.ndarray(nobs, np.float32) 98 codes_sqr = np.ndarray(ncodes, np.float32) 99 M = np.ndarray((nobs, ncodes), np.float32) 100 else: 101 obs_sqr = np.ndarray(nobs, np.float64) 102 codes_sqr = np.ndarray(ncodes, np.float64) 103 M = np.ndarray((nobs, ncodes), np.float64) 104 105 p_obs = obs 106 for i in range(nobs): 107 # obs_sqr[i] is the inner product of the i-th observation with itself 108 obs_sqr[i] = vec_sqr(nfeat, p_obs) 109 p_obs += nfeat 110 111 p_codes = code_book 112 for i in range(ncodes): 113 # codes_sqr[i] is the inner product of the i-th code with itself 114 codes_sqr[i] = vec_sqr(nfeat, p_codes) 115 p_codes += nfeat 116 117 # M[i][j] is the inner product of the i-th obs and j-th code 118 # M = obs * codes.T 119 cal_M(nobs, ncodes, nfeat, obs, code_book, <vq_type *>M.data) 120 121 for i in range(nobs): 122 for j in range(ncodes): 123 dist_sqr = (M[i, j] + 124 obs_sqr[i] + codes_sqr[j]) 125 if dist_sqr < low_dist[i]: 126 codes[i] = j 127 low_dist[i] = dist_sqr 128 129 # dist_sqr may be negative due to float point errors 130 if low_dist[i] > 0: 131 low_dist[i] = sqrt(low_dist[i]) 132 else: 133 low_dist[i] = 0 134 135 return 0 136 137 138cdef void _vq_small_nf(vq_type *obs, vq_type *code_book, 139 int ncodes, int nfeat, int nobs, 140 int32_t *codes, vq_type *low_dist): 141 """ 142 Vector quantization using naive algorithm. 143 This is preferred when nfeat is small. 144 The parameters are the same as those of _vq. 145 """ 146 # Temporary variables 147 cdef vq_type dist_sqr, diff 148 cdef np.npy_intp i, j, k, obs_offset = 0, code_offset 149 150 # Index and pointer to keep track of the current position in 151 # both arrays so that we don't have to always do index * nfeat. 152 cdef vq_type *current_obs 153 cdef vq_type *current_code 154 155 for i in range(nobs): 156 code_offset = 0 157 current_obs = &(obs[obs_offset]) 158 159 for j in range(ncodes): 160 dist_sqr = 0 161 current_code = &(code_book[code_offset]) 162 163 # Distance between code_book[j] and obs[i] 164 for k in range(nfeat): 165 diff = current_code[k] - current_obs[k] 166 dist_sqr += diff * diff 167 code_offset += nfeat 168 169 # Replace the code assignment and record distance if necessary 170 if dist_sqr < low_dist[i]: 171 codes[i] = j 172 low_dist[i] = dist_sqr 173 174 low_dist[i] = sqrt(low_dist[i]) 175 176 # Update the offset of the current observation 177 obs_offset += nfeat 178 179 180def vq(np.ndarray obs, np.ndarray codes): 181 """ 182 Vector quantization ndarray wrapper. Only support float32 and float64. 183 184 Parameters 185 ---------- 186 obs : ndarray 187 The observation matrix. Each row is an observation. 188 codes : ndarray 189 The code book matrix. 190 191 Notes 192 ----- 193 The observation matrix and code book matrix should have same ndim and 194 same number of columns (features). Only 1-dimensional and 2-dimensional 195 arrays are supported. 196 """ 197 cdef int nobs, ncodes, nfeat 198 cdef np.ndarray outcodes, outdists 199 200 # Ensure the arrays are contiguous 201 obs = np.ascontiguousarray(obs) 202 codes = np.ascontiguousarray(codes) 203 204 if obs.dtype != codes.dtype: 205 raise TypeError('observation and code should have same dtype') 206 if obs.dtype not in (np.float32, np.float64): 207 raise TypeError('type other than float or double not supported') 208 if obs.ndim != codes.ndim: 209 raise ValueError( 210 'observation and code should have same number of dimensions') 211 212 if obs.ndim == 1: 213 nfeat = 1 214 nobs = obs.shape[0] 215 ncodes = codes.shape[0] 216 elif obs.ndim == 2: 217 nfeat = obs.shape[1] 218 nobs = obs.shape[0] 219 ncodes = codes.shape[0] 220 if nfeat != codes.shape[1]: 221 raise ValueError('obs and code should have same number of ' 222 'features (columns)') 223 else: 224 raise ValueError('ndim different than 1 or 2 are not supported') 225 226 # Initialize outdists and outcodes array. 227 # Outdists should be initialized as INF. 228 outdists = np.empty((nobs,), dtype=obs.dtype) 229 outcodes = np.empty((nobs,), dtype=np.int32) 230 outdists.fill(np.inf) 231 232 if obs.dtype.type is np.float32: 233 _vq(<float32_t *>obs.data, <float32_t *>codes.data, 234 ncodes, nfeat, nobs, <int32_t *>outcodes.data, 235 <float32_t *>outdists.data) 236 elif obs.dtype.type is np.float64: 237 _vq(<float64_t *>obs.data, <float64_t *>codes.data, 238 ncodes, nfeat, nobs, <int32_t *>outcodes.data, 239 <float64_t *>outdists.data) 240 241 return outcodes, outdists 242 243 244@cython.cdivision(True) 245cdef np.ndarray _update_cluster_means(vq_type *obs, int32_t *labels, 246 vq_type *cb, int nobs, int nc, int nfeat): 247 """ 248 The underlying function (template) of _vq.update_cluster_means. 249 250 Parameters 251 ---------- 252 obs : vq_type* 253 The pointer to the observation matrix. 254 labels : int32_t* 255 The pointer to the array of the labels (codes) of the observations. 256 cb : vq_type* 257 The pointer to the new code book matrix. 258 nobs : int 259 The number of observations. 260 nc : int 261 The number of centroids (codes). 262 nfeat : int 263 The number of features of each observation. 264 265 Returns 266 ------- 267 has_members : ndarray 268 A boolean array indicating which clusters have members. 269 """ 270 cdef np.npy_intp i, j, cluster_size, label 271 cdef vq_type *obs_p 272 cdef vq_type *cb_p 273 cdef np.ndarray[int, ndim=1] obs_count 274 275 # Calculate the sums the numbers of obs in each cluster 276 obs_count = np.zeros(nc, np.intc) 277 obs_p = obs 278 for i in range(nobs): 279 label = labels[i] 280 cb_p = cb + nfeat * label 281 282 for j in range(nfeat): 283 cb_p[j] += obs_p[j] 284 285 # Count the obs in each cluster 286 obs_count[label] += 1 287 obs_p += nfeat 288 289 cb_p = cb 290 for i in range(nc): 291 cluster_size = obs_count[i] 292 293 if cluster_size > 0: 294 # Calculate the centroid of each cluster 295 for j in range(nfeat): 296 cb_p[j] /= cluster_size 297 298 cb_p += nfeat 299 300 # Return a boolean array indicating which clusters have members 301 return obs_count > 0 302 303 304def update_cluster_means(np.ndarray obs, np.ndarray labels, int nc): 305 """ 306 The update-step of K-means. Calculate the mean of observations in each 307 cluster. 308 309 Parameters 310 ---------- 311 obs : ndarray 312 The observation matrix. Each row is an observation. Its dtype must be 313 float32 or float64. 314 labels : ndarray 315 The label of each observation. Must be an 1d array. 316 nc : int 317 The number of centroids. 318 319 Returns 320 ------- 321 cb : ndarray 322 The new code book. 323 has_members : ndarray 324 A boolean array indicating which clusters have members. 325 326 Notes 327 ----- 328 The empty clusters will be set to all zeros and the corresponding elements 329 in `has_members` will be `False`. The upper level function should decide 330 how to deal with them. 331 """ 332 cdef np.ndarray has_members, cb 333 cdef int nfeat 334 335 # Ensure the arrays are contiguous 336 obs = np.ascontiguousarray(obs) 337 labels = np.ascontiguousarray(labels) 338 339 if obs.dtype not in (np.float32, np.float64): 340 raise TypeError('type other than float or double not supported') 341 if labels.dtype.type is not np.int32: 342 labels = labels.astype(np.int32) 343 if labels.ndim != 1: 344 raise ValueError('labels must be an 1d array') 345 346 if obs.ndim == 1: 347 nfeat = 1 348 cb = np.zeros(nc, dtype=obs.dtype) 349 elif obs.ndim == 2: 350 nfeat = obs.shape[1] 351 cb = np.zeros((nc, nfeat), dtype=obs.dtype) 352 else: 353 raise ValueError('ndim different than 1 or 2 are not supported') 354 355 if obs.dtype.type is np.float32: 356 has_members = _update_cluster_means(<float32_t *>obs.data, 357 <int32_t *>labels.data, 358 <float32_t *>cb.data, 359 obs.shape[0], nc, nfeat) 360 elif obs.dtype.type is np.float64: 361 has_members = _update_cluster_means(<float64_t *>obs.data, 362 <int32_t *>labels.data, 363 <float64_t *>cb.data, 364 obs.shape[0], nc, nfeat) 365 366 return cb, has_members 367