1"""
2Cython rewrite of the vector quantization module, originally written
3in C at src/vq.c and the wrapper at src/vq_module.c. This should be
4easier to maintain than old SWIG output.
5
6Original C version by Damian Eads.
7Translated to Cython by David Warde-Farley, October 2009.
8"""
9
10cimport cython
11import numpy as np
12cimport numpy as np
13from scipy.linalg.cython_blas cimport dgemm, sgemm
14
15from libc.math cimport sqrt
16
17ctypedef np.float64_t float64_t
18ctypedef np.float32_t float32_t
19ctypedef np.int32_t int32_t
20
21# Use Cython fused types for templating
22# Define supported data types as vq_type
23ctypedef fused vq_type:
24    float32_t
25    float64_t
26
27# When the number of features is less than this number,
28# switch back to the naive algorithm to avoid high overhead.
29DEF NFEATURES_CUTOFF=5
30
31# Initialize the NumPy C API
32np.import_array()
33
34
35cdef inline vq_type vec_sqr(int n, vq_type *p):
36    cdef vq_type result = 0.0
37    cdef int i
38    for i in range(n):
39        result += p[i] * p[i]
40    return result
41
42
43cdef inline void cal_M(int nobs, int ncodes, int nfeat, vq_type *obs,
44                       vq_type *code_book, vq_type *M):
45    """
46    Calculate M = obs * code_book.T
47    """
48    cdef vq_type alpha = -2.0, beta = 0.0
49
50    # Call BLAS functions with Fortran ABI
51    # Note that BLAS Fortran ABI uses column-major order
52    if vq_type is float32_t:
53        sgemm("T", "N", &ncodes, &nobs, &nfeat,
54               &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
55    else:
56        dgemm("T", "N", &ncodes, &nobs, &nfeat,
57              &alpha, code_book, &nfeat, obs, &nfeat, &beta, M, &ncodes)
58
59
60cdef int _vq(vq_type *obs, vq_type *code_book,
61             int ncodes, int nfeat, int nobs,
62             int32_t *codes, vq_type *low_dist) except -1:
63    """
64    The underlying function (template) of _vq.vq.
65
66    Parameters
67    ----------
68    obs : vq_type*
69        The pointer to the observation matrix.
70    code_book : vq_type*
71        The pointer to the code book matrix.
72    ncodes : int
73        The number of centroids (codes).
74    nfeat : int
75        The number of features of each observation.
76    nobs : int
77        The number of observations.
78    codes : int32_t*
79        The pointer to the new codes array.
80    low_dist : vq_type*
81        low_dist[i] is the Euclidean distance from obs[i] to the corresponding
82        centroid.
83    """
84    # Naive algorithm is preferred when nfeat is small
85    if nfeat < NFEATURES_CUTOFF:
86        _vq_small_nf(obs, code_book, ncodes, nfeat, nobs, codes, low_dist)
87        return 0
88
89    cdef np.npy_intp i, j
90    cdef vq_type *p_obs
91    cdef vq_type *p_codes
92    cdef vq_type dist_sqr
93    cdef np.ndarray[vq_type, ndim=1] obs_sqr, codes_sqr
94    cdef np.ndarray[vq_type, ndim=2] M
95
96    if vq_type is float32_t:
97        obs_sqr = np.ndarray(nobs, np.float32)
98        codes_sqr = np.ndarray(ncodes, np.float32)
99        M = np.ndarray((nobs, ncodes), np.float32)
100    else:
101        obs_sqr = np.ndarray(nobs, np.float64)
102        codes_sqr = np.ndarray(ncodes, np.float64)
103        M = np.ndarray((nobs, ncodes), np.float64)
104
105    p_obs = obs
106    for i in range(nobs):
107        # obs_sqr[i] is the inner product of the i-th observation with itself
108        obs_sqr[i] = vec_sqr(nfeat, p_obs)
109        p_obs += nfeat
110
111    p_codes = code_book
112    for i in range(ncodes):
113        # codes_sqr[i] is the inner product of the i-th code with itself
114        codes_sqr[i] = vec_sqr(nfeat, p_codes)
115        p_codes += nfeat
116
117    # M[i][j] is the inner product of the i-th obs and j-th code
118    # M = obs * codes.T
119    cal_M(nobs, ncodes, nfeat, obs, code_book, <vq_type *>M.data)
120
121    for i in range(nobs):
122        for j in range(ncodes):
123            dist_sqr = (M[i, j] +
124                    obs_sqr[i] + codes_sqr[j])
125            if dist_sqr < low_dist[i]:
126                codes[i] = j
127                low_dist[i] = dist_sqr
128
129        # dist_sqr may be negative due to float point errors
130        if low_dist[i] > 0:
131            low_dist[i] = sqrt(low_dist[i])
132        else:
133            low_dist[i] = 0
134
135    return 0
136
137
138cdef void _vq_small_nf(vq_type *obs, vq_type *code_book,
139                       int ncodes, int nfeat, int nobs,
140                       int32_t *codes, vq_type *low_dist):
141    """
142    Vector quantization using naive algorithm.
143    This is preferred when nfeat is small.
144    The parameters are the same as those of _vq.
145    """
146    # Temporary variables
147    cdef vq_type dist_sqr, diff
148    cdef np.npy_intp i, j, k, obs_offset = 0, code_offset
149
150    # Index and pointer to keep track of the current position in
151    # both arrays so that we don't have to always do index * nfeat.
152    cdef vq_type *current_obs
153    cdef vq_type *current_code
154
155    for i in range(nobs):
156        code_offset = 0
157        current_obs = &(obs[obs_offset])
158
159        for j in range(ncodes):
160            dist_sqr = 0
161            current_code = &(code_book[code_offset])
162
163            # Distance between code_book[j] and obs[i]
164            for k in range(nfeat):
165                diff = current_code[k] - current_obs[k]
166                dist_sqr += diff * diff
167            code_offset += nfeat
168
169            # Replace the code assignment and record distance if necessary
170            if dist_sqr < low_dist[i]:
171                codes[i] = j
172                low_dist[i] = dist_sqr
173
174        low_dist[i] = sqrt(low_dist[i])
175
176        # Update the offset of the current observation
177        obs_offset += nfeat
178
179
180def vq(np.ndarray obs, np.ndarray codes):
181    """
182    Vector quantization ndarray wrapper. Only support float32 and float64.
183
184    Parameters
185    ----------
186    obs : ndarray
187        The observation matrix. Each row is an observation.
188    codes : ndarray
189        The code book matrix.
190
191    Notes
192    -----
193    The observation matrix and code book matrix should have same ndim and
194    same number of columns (features). Only 1-dimensional and 2-dimensional
195    arrays are supported.
196    """
197    cdef int nobs, ncodes, nfeat
198    cdef np.ndarray outcodes, outdists
199
200    # Ensure the arrays are contiguous
201    obs = np.ascontiguousarray(obs)
202    codes = np.ascontiguousarray(codes)
203
204    if obs.dtype != codes.dtype:
205        raise TypeError('observation and code should have same dtype')
206    if obs.dtype not in (np.float32, np.float64):
207        raise TypeError('type other than float or double not supported')
208    if obs.ndim != codes.ndim:
209        raise ValueError(
210            'observation and code should have same number of dimensions')
211
212    if obs.ndim == 1:
213        nfeat = 1
214        nobs = obs.shape[0]
215        ncodes = codes.shape[0]
216    elif obs.ndim == 2:
217        nfeat = obs.shape[1]
218        nobs = obs.shape[0]
219        ncodes = codes.shape[0]
220        if nfeat != codes.shape[1]:
221            raise ValueError('obs and code should have same number of '
222                             'features (columns)')
223    else:
224        raise ValueError('ndim different than 1 or 2 are not supported')
225
226    # Initialize outdists and outcodes array.
227    # Outdists should be initialized as INF.
228    outdists = np.empty((nobs,), dtype=obs.dtype)
229    outcodes = np.empty((nobs,), dtype=np.int32)
230    outdists.fill(np.inf)
231
232    if obs.dtype.type is np.float32:
233        _vq(<float32_t *>obs.data, <float32_t *>codes.data,
234            ncodes, nfeat, nobs, <int32_t *>outcodes.data,
235            <float32_t *>outdists.data)
236    elif obs.dtype.type is np.float64:
237        _vq(<float64_t *>obs.data, <float64_t *>codes.data,
238            ncodes, nfeat, nobs, <int32_t *>outcodes.data,
239            <float64_t *>outdists.data)
240
241    return outcodes, outdists
242
243
244@cython.cdivision(True)
245cdef np.ndarray _update_cluster_means(vq_type *obs, int32_t *labels,
246                                      vq_type *cb, int nobs, int nc, int nfeat):
247    """
248    The underlying function (template) of _vq.update_cluster_means.
249
250    Parameters
251    ----------
252    obs : vq_type*
253        The pointer to the observation matrix.
254    labels : int32_t*
255        The pointer to the array of the labels (codes) of the observations.
256    cb : vq_type*
257        The pointer to the new code book matrix.
258    nobs : int
259        The number of observations.
260    nc : int
261        The number of centroids (codes).
262    nfeat : int
263        The number of features of each observation.
264
265    Returns
266    -------
267    has_members : ndarray
268        A boolean array indicating which clusters have members.
269    """
270    cdef np.npy_intp i, j, cluster_size, label
271    cdef vq_type *obs_p
272    cdef vq_type *cb_p
273    cdef np.ndarray[int, ndim=1] obs_count
274
275    # Calculate the sums the numbers of obs in each cluster
276    obs_count = np.zeros(nc, np.intc)
277    obs_p = obs
278    for i in range(nobs):
279        label = labels[i]
280        cb_p = cb + nfeat * label
281
282        for j in range(nfeat):
283            cb_p[j] += obs_p[j]
284
285        # Count the obs in each cluster
286        obs_count[label] += 1
287        obs_p += nfeat
288
289    cb_p = cb
290    for i in range(nc):
291        cluster_size = obs_count[i]
292
293        if cluster_size > 0:
294            # Calculate the centroid of each cluster
295            for j in range(nfeat):
296                cb_p[j] /= cluster_size
297
298        cb_p += nfeat
299
300    # Return a boolean array indicating which clusters have members
301    return obs_count > 0
302
303
304def update_cluster_means(np.ndarray obs, np.ndarray labels, int nc):
305    """
306    The update-step of K-means. Calculate the mean of observations in each
307    cluster.
308
309    Parameters
310    ----------
311    obs : ndarray
312        The observation matrix. Each row is an observation. Its dtype must be
313        float32 or float64.
314    labels : ndarray
315        The label of each observation. Must be an 1d array.
316    nc : int
317        The number of centroids.
318
319    Returns
320    -------
321    cb : ndarray
322        The new code book.
323    has_members : ndarray
324        A boolean array indicating which clusters have members.
325
326    Notes
327    -----
328    The empty clusters will be set to all zeros and the corresponding elements
329    in `has_members` will be `False`. The upper level function should decide
330    how to deal with them.
331    """
332    cdef np.ndarray has_members, cb
333    cdef int nfeat
334
335    # Ensure the arrays are contiguous
336    obs = np.ascontiguousarray(obs)
337    labels = np.ascontiguousarray(labels)
338
339    if obs.dtype not in (np.float32, np.float64):
340        raise TypeError('type other than float or double not supported')
341    if labels.dtype.type is not np.int32:
342        labels = labels.astype(np.int32)
343    if labels.ndim != 1:
344        raise ValueError('labels must be an 1d array')
345
346    if obs.ndim == 1:
347        nfeat = 1
348        cb = np.zeros(nc, dtype=obs.dtype)
349    elif obs.ndim == 2:
350        nfeat = obs.shape[1]
351        cb = np.zeros((nc, nfeat), dtype=obs.dtype)
352    else:
353        raise ValueError('ndim different than 1 or 2 are not supported')
354
355    if obs.dtype.type is np.float32:
356        has_members = _update_cluster_means(<float32_t *>obs.data,
357                                            <int32_t *>labels.data,
358                                            <float32_t *>cb.data,
359                                            obs.shape[0], nc, nfeat)
360    elif obs.dtype.type is np.float64:
361        has_members = _update_cluster_means(<float64_t *>obs.data,
362                                            <int32_t *>labels.data,
363                                            <float64_t *>cb.data,
364                                            obs.shape[0], nc, nfeat)
365
366    return cb, has_members
367