1# !python
2# cython: boundscheck=False
3# cython: wraparound=False
4# cython: cdivision=True
5
6# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu>
7# written for the scikit-learn project
8# modified for HDBSCAN Dual Tree Boruvka algorithm
9# License: BSD
10
11import numpy as np
12cimport numpy as np
13np.import_array()  # required in order to use C-API
14
15from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI
16
17DTYPE = np.double
18ITYPE = np.intp
19
20
21######################################################################
22# Numpy 1.3-1.4 compatibility utilities
23cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D(
24                               np.ndarray[DTYPE_t, ndim=2, mode='c'] X):
25    return <DTYPE_t[:X.shape[0], :X.shape[1]:1]> (<DTYPE_t*> X.data)
26
27
28cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec):
29    return &vec[0]
30
31
32cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat):
33    return &mat[0, 0]
34######################################################################
35
36
37# First, define a function to get an ndarray from a memory bufffer
38cdef extern from "numpy/arrayobject.h":
39    object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims,
40                                     int typenum, void* data)
41
42
43cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n):
44    # Wrap a memory buffer with an ndarray. Warning: this is not robust.
45    # In particular, if x is deallocated before the returned array goes
46    # out of scope, this could cause memory errors.  Since there is not
47    # a possibility of this for our use-case, this should be safe.
48
49    # Note: this Segfaults unless np.import_array() is called above
50    return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x)
51
52
53# some handy constants
54from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin
55cdef DTYPE_t INF = np.inf
56
57
58######################################################################
59# newObj function
60#  this is a helper function for pickling
61def newObj(obj):
62    return obj.__new__(obj)
63
64
65######################################################################
66# metric mappings
67#  These map from metric id strings to class names
68METRIC_MAPPING = {'euclidean': EuclideanDistance,
69                  'l2': EuclideanDistance,
70                  'minkowski': MinkowskiDistance,
71                  'p': MinkowskiDistance,
72                  'manhattan': ManhattanDistance,
73                  'cityblock': ManhattanDistance,
74                  'l1': ManhattanDistance,
75                  'chebyshev': ChebyshevDistance,
76                  'infinity': ChebyshevDistance,
77                  'seuclidean': SEuclideanDistance,
78                  'mahalanobis': MahalanobisDistance,
79                  'wminkowski': WMinkowskiDistance,
80                  'hamming': HammingDistance,
81                  'canberra': CanberraDistance,
82                  'braycurtis': BrayCurtisDistance,
83                  'matching': MatchingDistance,
84                  'jaccard': JaccardDistance,
85                  'dice': DiceDistance,
86                  'kulsinski': KulsinskiDistance,
87                  'rogerstanimoto': RogersTanimotoDistance,
88                  'russellrao': RussellRaoDistance,
89                  'sokalmichener': SokalMichenerDistance,
90                  'sokalsneath': SokalSneathDistance,
91                  'haversine': HaversineDistance,
92                  'cosine': ArccosDistance,
93                  'arccos': ArccosDistance,
94                  'pyfunc': PyFuncDistance}
95
96
97def get_valid_metric_ids(L):
98    """Given an iterable of metric class names or class identifiers,
99    return a list of metric IDs which map to those classes.
100
101    Examples
102    --------
103    >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance'])
104    >>> sorted(L)
105    ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan']
106    """
107    return [key for (key, val) in METRIC_MAPPING.items()
108            if (val.__name__ in L) or (val in L)]
109
110
111######################################################################
112# Distance Metric Classes
113cdef class DistanceMetric:
114    """DistanceMetric class
115
116    This class provides a uniform interface to fast distance metric
117    functions.  The various metrics can be accessed via the `get_metric`
118    class method and the metric string identifier (see below).
119
120    Examples
121    --------
122
123    For example, to use the Euclidean distance:
124
125    >>> dist = DistanceMetric.get_metric('euclidean')
126    >>> X = [[0, 1, 2],
127             [3, 4, 5]])
128    >>> dist.pairwise(X)
129    array([[ 0.        ,  5.19615242],
130           [ 5.19615242,  0.        ]])
131
132    Available Metrics
133    The following lists the string metric identifiers and the associated
134    distance metric classes:
135
136    **Metrics intended for real-valued vector spaces:**
137
138    ==============  ====================  ========  ===============================
139    identifier      class name            args      distance function
140    --------------  --------------------  --------  -------------------------------
141    "euclidean"     EuclideanDistance     -         ``sqrt(sum((x - y)^2))``
142    "manhattan"     ManhattanDistance     -         ``sum(|x - y|)``
143    "chebyshev"     ChebyshevDistance     -         ``sum(max(|x - y|))``
144    "minkowski"     MinkowskiDistance     p         ``sum(|x - y|^p)^(1/p)``
145    "wminkowski"    WMinkowskiDistance    p, w      ``sum(w * |x - y|^p)^(1/p)``
146    "seuclidean"    SEuclideanDistance    V         ``sqrt(sum((x - y)^2 / V))``
147    "mahalanobis"   MahalanobisDistance   V or VI   ``sqrt((x - y)' V^-1 (x - y))``
148    ==============  ====================  ========  ===============================
149
150    **Metrics intended for two-dimensional vector spaces:**  Note that the haversine
151    distance metric requires data in the form of [latitude, longitude] and both
152    inputs and outputs are in units of radians.
153
154    ============  ==================  ========================================
155    identifier    class name          distance function
156    ------------  ------------------  ----------------------------------------
157    "haversine"   HaversineDistance   2 arcsin(sqrt(sin^2(0.5*dx)
158                                             + cos(x1)cos(x2)sin^2(0.5*dy)))
159    ============  ==================  ========================================
160
161
162    **Metrics intended for integer-valued vector spaces:**  Though intended
163    for integer-valued vectors, these are also valid metrics in the case of
164    real-valued vectors.
165
166    =============  ====================  ========================================
167    identifier     class name            distance function
168    -------------  --------------------  ----------------------------------------
169    "hamming"      HammingDistance       ``N_unequal(x, y) / N_tot``
170    "canberra"     CanberraDistance      ``sum(|x - y| / (|x| + |y|))``
171    "braycurtis"   BrayCurtisDistance    ``sum(|x - y|) / (sum(|x|) + sum(|y|))``
172    =============  ====================  ========================================
173
174    **Metrics intended for boolean-valued vector spaces:**  Any nonzero entry
175    is evaluated to "True".  In the listings below, the following
176    abbreviations are used:
177
178     - N  : number of dimensions
179     - NTT : number of dims in which both values are True
180     - NTF : number of dims in which the first value is True, second is False
181     - NFT : number of dims in which the first value is False, second is True
182     - NFF : number of dims in which both values are False
183     - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT
184     - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT
185
186    =================  =======================  ===============================
187    identifier         class name               distance function
188    -----------------  -----------------------  -------------------------------
189    "jaccard"          JaccardDistance          NNEQ / NNZ
190    "maching"          MatchingDistance         NNEQ / N
191    "dice"             DiceDistance             NNEQ / (NTT + NNZ)
192    "kulsinski"        KulsinskiDistance        (NNEQ + N - NTT) / (NNEQ + N)
193    "rogerstanimoto"   RogersTanimotoDistance   2 * NNEQ / (N + NNEQ)
194    "russellrao"       RussellRaoDistance       NNZ / N
195    "sokalmichener"    SokalMichenerDistance    2 * NNEQ / (N + NNEQ)
196    "sokalsneath"      SokalSneathDistance      NNEQ / (NNEQ + 0.5 * NTT)
197    =================  =======================  ===============================
198
199    **User-defined distance:**
200
201    ===========    ===============    =======
202    identifier     class name         args
203    -----------    ---------------    -------
204    "pyfunc"       PyFuncDistance     func
205    ===========    ===============    =======
206
207    Here ``func`` is a function which takes two one-dimensional numpy
208    arrays, and returns a distance.  Note that in order to be used within
209    the BallTree, the distance must be a true metric:
210    i.e. it must satisfy the following properties
211
212    1) Non-negativity: d(x, y) >= 0
213    2) Identity: d(x, y) = 0 if and only if x == y
214    3) Symmetry: d(x, y) = d(y, x)
215    4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z)
216
217    Because of the Python object overhead involved in calling the python
218    function, this will be fairly slow, but it will have the same
219    scaling as other distances.
220    """
221    def __cinit__(self):
222        self.p = 2
223        self.vec = np.zeros(1, dtype=DTYPE, order='c')
224        self.mat = np.zeros((1, 1), dtype=DTYPE, order='c')
225        self.vec_ptr = get_vec_ptr(self.vec)
226        self.mat_ptr = get_mat_ptr(self.mat)
227        self.size = 1
228
229    def __reduce__(self):
230        """
231        reduce method used for pickling
232        """
233        return (newObj, (self.__class__,), self.__getstate__())
234
235    def __getstate__(self):
236        """
237        get state for pickling
238        """
239        if self.__class__.__name__ == "PyFuncDistance":
240            return (float(self.p), self.vec, self.mat, self.func, self.kwargs)
241        return (float(self.p), self.vec, self.mat)
242
243    def __setstate__(self, state):
244        """
245        set state for pickling
246        """
247        self.p = state[0]
248        self.vec = state[1]
249        self.mat = state[2]
250        if self.__class__.__name__ == "PyFuncDistance":
251            self.func = state[3]
252            self.kwargs = state[4]
253        self.vec_ptr = get_vec_ptr(self.vec)
254        self.mat_ptr = get_mat_ptr(self.mat)
255        self.size = 1
256
257    @classmethod
258    def get_metric(cls, metric, **kwargs):
259        """Get the given distance metric from the string identifier.
260
261        See the docstring of DistanceMetric for a list of available metrics.
262
263        Parameters
264        ----------
265        metric : string or class name
266            The distance metric to use
267        **kwargs
268            additional arguments will be passed to the requested metric
269        """
270        if isinstance(metric, DistanceMetric):
271            return metric
272
273        if callable(metric):
274            return PyFuncDistance(metric, **kwargs)
275
276        # Map the metric string ID to the metric class
277        if isinstance(metric, type) and issubclass(metric, DistanceMetric):
278            pass
279        else:
280            try:
281                metric = METRIC_MAPPING[metric]
282            except:
283                raise ValueError("Unrecognized metric '%s'" % metric)
284
285        # In Minkowski special cases, return more efficient methods
286        if metric is MinkowskiDistance:
287            p = kwargs.pop('p', 2)
288            if p == 1:
289                return ManhattanDistance(**kwargs)
290            elif p == 2:
291                return EuclideanDistance(**kwargs)
292            elif np.isinf(p):
293                return ChebyshevDistance(**kwargs)
294            else:
295                return MinkowskiDistance(p, **kwargs)
296        else:
297            return metric(**kwargs)
298
299    def __init__(self):
300        if self.__class__ is DistanceMetric:
301            raise NotImplementedError("DistanceMetric is an abstract class")
302
303    cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
304                      ITYPE_t size) nogil except -1:
305        """Compute the distance between vectors x1 and x2
306
307        This should be overridden in a base class.
308        """
309        return -999
310
311    cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
312                       ITYPE_t size) nogil except -1:
313        """Compute the reduced distance between vectors x1 and x2.
314
315        This can optionally be overridden in a base class.
316
317        The reduced distance is any measure that yields the same rank as the
318        distance, but is more efficient to compute.  For example, for the
319        Euclidean metric, the reduced distance is the squared-euclidean
320        distance.
321        """
322        return self.dist(x1, x2, size)
323
324    cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1:
325        """compute the pairwise distances between points in X"""
326        cdef ITYPE_t i1, i2
327        for i1 in range(X.shape[0]):
328            for i2 in range(i1, X.shape[0]):
329                D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1])
330                D[i2, i1] = D[i1, i2]
331        return 0
332
333    cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y,
334                   DTYPE_t[:, ::1] D) except -1:
335        """compute the cross-pairwise distances between arrays X and Y"""
336        cdef ITYPE_t i1, i2
337        if X.shape[1] != Y.shape[1]:
338            raise ValueError('X and Y must have the same second dimension')
339        for i1 in range(X.shape[0]):
340            for i2 in range(Y.shape[0]):
341                D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1])
342        return 0
343
344    cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
345        """Convert the reduced distance to the distance"""
346        return rdist
347
348    cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
349        """Convert the distance to the reduced distance"""
350        return dist
351
352    def rdist_to_dist(self, rdist):
353        """Convert the Reduced distance to the true distance.
354
355        The reduced distance, defined for some metrics, is a computationally
356        more efficent measure which preserves the rank of the true distance.
357        For example, in the Euclidean distance metric, the reduced distance
358        is the squared-euclidean distance.
359        """
360        return rdist
361
362    def dist_to_rdist(self, dist):
363        """Convert the true distance to the reduced distance.
364
365        The reduced distance, defined for some metrics, is a computationally
366        more efficent measure which preserves the rank of the true distance.
367        For example, in the Euclidean distance metric, the reduced distance
368        is the squared-euclidean distance.
369        """
370        return dist
371
372    def pairwise(self, X, Y=None):
373        """Compute the pairwise distances between X and Y
374
375        This is a convenience routine for the sake of testing.  For many
376        metrics, the utilities in scipy.spatial.distance.cdist and
377        scipy.spatial.distance.pdist will be faster.
378
379        Parameters
380        ----------
381        X : array_like
382            Array of shape (Nx, D), representing Nx points in D dimensions.
383        Y : array_like (optional)
384            Array of shape (Ny, D), representing Ny points in D dimensions.
385            If not specified, then Y=X.
386        Returns
387        -------
388        dist : ndarray
389            The shape (Nx, Ny) array of pairwise distances between points in
390            X and Y.
391        """
392        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr
393        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr
394        cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr
395
396        Xarr = np.asarray(X, dtype=DTYPE, order='C')
397        if Y is None:
398            Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]),
399                            dtype=DTYPE, order='C')
400            self.pdist(get_memview_DTYPE_2D(Xarr),
401                       get_memview_DTYPE_2D(Darr))
402        else:
403            Yarr = np.asarray(Y, dtype=DTYPE, order='C')
404            Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]),
405                            dtype=DTYPE, order='C')
406            self.cdist(get_memview_DTYPE_2D(Xarr),
407                       get_memview_DTYPE_2D(Yarr),
408                       get_memview_DTYPE_2D(Darr))
409        return Darr
410
411
412# ------------------------------------------------------------
413# Euclidean Distance
414#  d = sqrt(sum(x_i^2 - y_i^2))
415cdef class EuclideanDistance(DistanceMetric):
416    """Euclidean Distance metric
417
418    .. math::
419       D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 }
420    """
421    def __init__(self):
422        self.p = 2
423
424    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
425                             ITYPE_t size) nogil except -1:
426        return euclidean_dist(x1, x2, size)
427
428    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
429                              ITYPE_t size) nogil except -1:
430        return euclidean_rdist(x1, x2, size)
431
432    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
433        return sqrt(rdist)
434
435    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
436        return dist * dist
437
438    def rdist_to_dist(self, rdist):
439        return np.sqrt(rdist)
440
441    def dist_to_rdist(self, dist):
442        return dist ** 2
443
444
445# ------------------------------------------------------------
446# SEuclidean Distance
447#  d = sqrt(sum((x_i - y_i2)^2 / v_i))
448cdef class SEuclideanDistance(DistanceMetric):
449    """Standardized Euclidean Distance metric
450
451    .. math::
452       D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} }
453    """
454    def __init__(self, V):
455        self.vec = np.asarray(V, dtype=DTYPE)
456        self.vec_ptr = get_vec_ptr(self.vec)
457        self.size = self.vec.shape[0]
458        self.p = 2
459
460    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
461                              ITYPE_t size) nogil except -1:
462        if size != self.size:
463            with gil:
464                raise ValueError('SEuclidean dist: size of V does not match')
465        cdef DTYPE_t tmp, d=0
466        cdef np.intp_t j
467        for j in range(size):
468            tmp = x1[j] - x2[j]
469            d += tmp * tmp / self.vec_ptr[j]
470        return d
471
472    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
473                             ITYPE_t size) nogil except -1:
474        return sqrt(self.rdist(x1, x2, size))
475
476    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
477        return sqrt(rdist)
478
479    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
480        return dist * dist
481
482    def rdist_to_dist(self, rdist):
483        return np.sqrt(rdist)
484
485    def dist_to_rdist(self, dist):
486        return dist ** 2
487
488
489# ------------------------------------------------------------
490# Manhattan Distance
491#  d = sum(abs(x_i - y_i))
492cdef class ManhattanDistance(DistanceMetric):
493    """Manhattan/City-block Distance metric
494
495    .. math::
496       D(x, y) = \sum_i |x_i - y_i|
497    """
498    def __init__(self):
499        self.p = 1
500
501    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
502                             ITYPE_t size) nogil except -1:
503        cdef DTYPE_t d = 0
504        cdef np.intp_t j
505        for j in range(size):
506            d += fabs(x1[j] - x2[j])
507        return d
508
509
510# ------------------------------------------------------------
511# Chebyshev Distance
512#  d = max_i(abs(x_i), abs(y_i))
513cdef class ChebyshevDistance(DistanceMetric):
514    """Chebyshev/Infinity Distance
515
516    .. math::
517       D(x, y) = max_i (|x_i - y_i|)
518    """
519    def __init__(self):
520        self.p = INF
521
522    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
523                             ITYPE_t size) nogil except -1:
524        cdef DTYPE_t d = 0
525        cdef np.intp_t j
526        for j in range(size):
527            d = fmax(d, fabs(x1[j] - x2[j]))
528        return d
529
530
531# ------------------------------------------------------------
532# Minkowski Distance
533#  d = sum(x_i^p - y_i^p) ^ (1/p)
534cdef class MinkowskiDistance(DistanceMetric):
535    """Minkowski Distance
536
537    .. math::
538       D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p)
539
540    Minkowski Distance requires p >= 1 and finite. For p = infinity,
541    use ChebyshevDistance.
542    Note that for p=1, ManhattanDistance is more efficient, and for
543    p=2, EuclideanDistance is more efficient.
544    """
545    def __init__(self, p):
546        if p < 1:
547            raise ValueError("p must be greater than 1")
548        elif np.isinf(p):
549            raise ValueError("MinkowskiDistance requires finite p. "
550                             "For p=inf, use ChebyshevDistance.")
551        self.p = p
552
553    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
554                              ITYPE_t size) nogil except -1:
555        cdef DTYPE_t d=0
556        cdef np.intp_t j
557        for j in range(size):
558            d += pow(fabs(x1[j] - x2[j]), self.p)
559        return d
560
561    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
562                             ITYPE_t size) nogil except -1:
563        return pow(self.rdist(x1, x2, size), 1. / self.p)
564
565    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
566        return pow(rdist, 1. / self.p)
567
568    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
569        return pow(dist, self.p)
570
571    def rdist_to_dist(self, rdist):
572        return rdist ** (1. / self.p)
573
574    def dist_to_rdist(self, dist):
575        return dist ** self.p
576
577
578# ------------------------------------------------------------
579# W-Minkowski Distance
580#  d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p)
581cdef class WMinkowskiDistance(DistanceMetric):
582    """Weighted Minkowski Distance
583
584    .. math::
585       D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p)
586
587    Weighted Minkowski Distance requires p >= 1 and finite.
588
589    Parameters
590    ----------
591    p : int
592        The order of the norm of the difference :math:`{||u-v||}_p`.
593    w : (N,) array_like
594        The weight vector.
595
596    """
597    def __init__(self, p, w):
598        if p < 1:
599            raise ValueError("p must be greater than 1")
600        elif np.isinf(p):
601            raise ValueError("WMinkowskiDistance requires finite p. "
602                             "For p=inf, use ChebyshevDistance.")
603        self.p = p
604        self.vec = np.asarray(w, dtype=DTYPE)
605        self.vec_ptr = get_vec_ptr(self.vec)
606        self.size = self.vec.shape[0]
607
608    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
609                              ITYPE_t size) nogil except -1:
610        if size != self.size:
611            with gil:
612                raise ValueError('WMinkowskiDistance dist: '
613                                 'size of w does not match')
614        cdef DTYPE_t d=0
615        cdef np.intp_t j
616        for j in range(size):
617            d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p)
618        return d
619
620    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
621                             ITYPE_t size) nogil except -1:
622        return pow(self.rdist(x1, x2, size), 1. / self.p)
623
624    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
625        return pow(rdist, 1. / self.p)
626
627    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
628        return pow(dist, self.p)
629
630    def rdist_to_dist(self, rdist):
631        return rdist ** (1. / self.p)
632
633    def dist_to_rdist(self, dist):
634        return dist ** self.p
635
636
637# ------------------------------------------------------------
638# Mahalanobis Distance
639#  d = sqrt( (x - y)^T V^-1 (x - y) )
640cdef class MahalanobisDistance(DistanceMetric):
641    """Mahalanobis Distance
642
643    .. math::
644       D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) }
645
646    Parameters
647    ----------
648    V : array_like
649        Symmetric positive-definite covariance matrix.
650        The inverse of this matrix will be explicitly computed.
651    VI : array_like
652        optionally specify the inverse directly.  If VI is passed,
653        then V is not referenced.
654    """
655    def __init__(self, V=None, VI=None):
656        if VI is None:
657            VI = np.linalg.inv(V)
658        if VI.ndim != 2 or VI.shape[0] != VI.shape[1]:
659            raise ValueError("V/VI must be square")
660
661        self.mat = np.asarray(VI, dtype=float, order='C')
662        self.mat_ptr = get_mat_ptr(self.mat)
663
664        self.size = self.mat.shape[0]
665
666        # we need vec as a work buffer
667        self.vec = np.zeros(self.size, dtype=DTYPE)
668        self.vec_ptr = get_vec_ptr(self.vec)
669
670    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
671                              ITYPE_t size) nogil except -1:
672        if size != self.size:
673            with gil:
674                raise ValueError('Mahalanobis dist: size of V does not match')
675
676        cdef DTYPE_t tmp, d = 0
677        cdef np.intp_t i, j
678
679        # compute (x1 - x2).T * VI * (x1 - x2)
680        for i in range(size):
681            self.vec_ptr[i] = x1[i] - x2[i]
682
683        for i in range(size):
684            tmp = 0
685            for j in range(size):
686                tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j]
687            d += tmp * self.vec_ptr[i]
688        return d
689
690    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
691                             ITYPE_t size) nogil except -1:
692        return sqrt(self.rdist(x1, x2, size))
693
694    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
695        return sqrt(rdist)
696
697    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
698        return dist * dist
699
700    def rdist_to_dist(self, rdist):
701        return np.sqrt(rdist)
702
703    def dist_to_rdist(self, dist):
704        return dist ** 2
705
706
707# ------------------------------------------------------------
708# Hamming Distance
709#  d = N_unequal(x, y) / N_tot
710cdef class HammingDistance(DistanceMetric):
711    """Hamming Distance
712
713    Hamming distance is meant for discrete-valued vectors, though it is
714    a valid metric for real-valued vectors.
715
716    .. math::
717       D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i}
718    """
719    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
720                             ITYPE_t size) nogil except -1:
721        cdef int n_unequal = 0
722        cdef np.intp_t j
723        for j in range(size):
724            if x1[j] != x2[j]:
725                n_unequal += 1
726        return float(n_unequal) / size
727
728
729# ------------------------------------------------------------
730# Canberra Distance
731#  D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ]
732cdef class CanberraDistance(DistanceMetric):
733    """Canberra Distance
734
735    Canberra distance is meant for discrete-valued vectors, though it is
736    a valid metric for real-valued vectors.
737
738    .. math::
739       D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|}
740    """
741    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
742                             ITYPE_t size) nogil except -1:
743        cdef DTYPE_t denom, d = 0
744        cdef np.intp_t j
745        for j in range(size):
746            denom = fabs(x1[j]) + fabs(x2[j])
747            if denom > 0:
748                d += fabs(x1[j] - x2[j]) / denom
749        return d
750
751
752# ------------------------------------------------------------
753# Bray-Curtis Distance
754#  D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)]
755cdef class BrayCurtisDistance(DistanceMetric):
756    """Bray-Curtis Distance
757
758    Bray-Curtis distance is meant for discrete-valued vectors, though it is
759    a valid metric for real-valued vectors.
760
761    .. math::
762       D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)}
763    """
764    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
765                             ITYPE_t size) nogil except -1:
766        cdef DTYPE_t num = 0, denom = 0
767        cdef np.intp_t j
768        for j in range(size):
769            num += fabs(x1[j] - x2[j])
770            denom += fabs(x1[j]) + fabs(x2[j])
771        if denom > 0:
772            return num / denom
773        else:
774            return 0.0
775
776
777# ------------------------------------------------------------
778# Jaccard Distance (boolean)
779#  D(x, y) = N_unequal(x, y) / N_nonzero(x, y)
780cdef class JaccardDistance(DistanceMetric):
781    """Jaccard Distance
782
783    Jaccard Distance is a dissimilarity measure for boolean-valued
784    vectors. All nonzero entries will be treated as True, zero entries will
785    be treated as False.
786
787    .. math::
788       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}}
789    """
790    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
791                             ITYPE_t size) nogil except -1:
792        cdef int tf1, tf2, n_eq = 0, nnz = 0
793        cdef np.intp_t j
794        for j in range(size):
795            tf1 = x1[j] != 0
796            tf2 = x2[j] != 0
797            nnz += (tf1 or tf2)
798            n_eq += (tf1 and tf2)
799        if nnz == 0:
800            return 0.0
801        return (nnz - n_eq) * 1.0 / nnz
802
803
804# ------------------------------------------------------------
805# Matching Distance (boolean)
806#  D(x, y) = n_neq / n
807cdef class MatchingDistance(DistanceMetric):
808    """Matching Distance
809
810    Matching Distance is a dissimilarity measure for boolean-valued
811    vectors. All nonzero entries will be treated as True, zero entries will
812    be treated as False.
813
814    .. math::
815       D(x, y) = \frac{N_{TF} + N_{FT}}{N}
816    """
817    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
818                             ITYPE_t size) nogil except -1:
819        cdef int tf1, tf2, n_neq = 0
820        cdef np.intp_t j
821        for j in range(size):
822            tf1 = x1[j] != 0
823            tf2 = x2[j] != 0
824            n_neq += (tf1 != tf2)
825        return n_neq * 1. / size
826
827
828# ------------------------------------------------------------
829# Dice Distance (boolean)
830#  D(x, y) = n_neq / (2 * ntt + n_neq)
831cdef class DiceDistance(DistanceMetric):
832    """Dice Distance
833
834    Dice Distance is a dissimilarity measure for boolean-valued
835    vectors. All nonzero entries will be treated as True, zero entries will
836    be treated as False.
837
838    .. math::
839       D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}}
840    """
841    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
842                             ITYPE_t size) nogil except -1:
843        cdef int tf1, tf2, n_neq = 0, ntt = 0
844        cdef np.intp_t j
845        for j in range(size):
846            tf1 = x1[j] != 0
847            tf2 = x2[j] != 0
848            ntt += (tf1 and tf2)
849            n_neq += (tf1 != tf2)
850        return n_neq / (2.0 * ntt + n_neq)
851
852
853# ------------------------------------------------------------
854# Kulsinski Distance (boolean)
855#  D(x, y) = (ntf + nft - ntt + n) / (n_neq + n)
856cdef class KulsinskiDistance(DistanceMetric):
857    """Kulsinski Distance
858
859    Kulsinski Distance is a dissimilarity measure for boolean-valued
860    vectors. All nonzero entries will be treated as True, zero entries will
861    be treated as False.
862
863    .. math::
864       D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}}
865    """
866    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
867                             ITYPE_t size) nogil except -1:
868        cdef int tf1, tf2, ntt = 0, n_neq = 0
869        cdef np.intp_t j
870        for j in range(size):
871            tf1 = x1[j] != 0
872            tf2 = x2[j] != 0
873            n_neq += (tf1 != tf2)
874            ntt += (tf1 and tf2)
875        return (n_neq - ntt + size) * 1.0 / (n_neq + size)
876
877
878# ------------------------------------------------------------
879# Rogers-Tanimoto Distance (boolean)
880#  D(x, y) = 2 * n_neq / (n + n_neq)
881cdef class RogersTanimotoDistance(DistanceMetric):
882    """Rogers-Tanimoto Distance
883
884    Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued
885    vectors. All nonzero entries will be treated as True, zero entries will
886    be treated as False.
887
888    .. math::
889       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
890    """
891    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
892                             ITYPE_t size) nogil except -1:
893        cdef int tf1, tf2, n_neq = 0
894        cdef np.intp_t j
895        for j in range(size):
896            tf1 = x1[j] != 0
897            tf2 = x2[j] != 0
898            n_neq += (tf1 != tf2)
899        return (2.0 * n_neq) / (size + n_neq)
900
901
902# ------------------------------------------------------------
903# Russell-Rao Distance (boolean)
904#  D(x, y) = (n - ntt) / n
905cdef class RussellRaoDistance(DistanceMetric):
906    """Russell-Rao Distance
907
908    Russell-Rao Distance is a dissimilarity measure for boolean-valued
909    vectors. All nonzero entries will be treated as True, zero entries will
910    be treated as False.
911
912    .. math::
913       D(x, y) = \frac{N - N_{TT}}{N}
914    """
915    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
916                             ITYPE_t size) nogil except -1:
917        cdef int tf1, tf2, ntt = 0
918        cdef np.intp_t j
919        for j in range(size):
920            tf1 = x1[j] != 0
921            tf2 = x2[j] != 0
922            ntt += (tf1 and tf2)
923        return (size - ntt) * 1. / size
924
925
926# ------------------------------------------------------------
927# Sokal-Michener Distance (boolean)
928#  D(x, y) = 2 * n_neq / (n + n_neq)
929cdef class SokalMichenerDistance(DistanceMetric):
930    """Sokal-Michener Distance
931
932    Sokal-Michener Distance is a dissimilarity measure for boolean-valued
933    vectors. All nonzero entries will be treated as True, zero entries will
934    be treated as False.
935
936    .. math::
937       D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}}
938    """
939    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
940                             ITYPE_t size) nogil except -1:
941        cdef int tf1, tf2, n_neq = 0
942        cdef np.intp_t j
943        for j in range(size):
944            tf1 = x1[j] != 0
945            tf2 = x2[j] != 0
946            n_neq += (tf1 != tf2)
947        return (2.0 * n_neq) / (size + n_neq)
948
949
950# ------------------------------------------------------------
951# Sokal-Sneath Distance (boolean)
952#  D(x, y) = n_neq / (0.5 * n_tt + n_neq)
953cdef class SokalSneathDistance(DistanceMetric):
954    """Sokal-Sneath Distance
955
956    Sokal-Sneath Distance is a dissimilarity measure for boolean-valued
957    vectors. All nonzero entries will be treated as True, zero entries will
958    be treated as False.
959
960    .. math::
961       D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}}
962    """
963    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
964                             ITYPE_t size) nogil except -1:
965        cdef int tf1, tf2, ntt = 0, n_neq = 0
966        cdef np.intp_t j
967        for j in range(size):
968            tf1 = x1[j] != 0
969            tf2 = x2[j] != 0
970            n_neq += (tf1 != tf2)
971            ntt += (tf1 and tf2)
972        return n_neq / (0.5 * ntt + n_neq)
973
974
975# ------------------------------------------------------------
976# Haversine Distance (2 dimensional)
977#  D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2)
978#                          + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]}
979cdef class HaversineDistance(DistanceMetric):
980    """Haversine (Spherical) Distance
981
982    The Haversine distance is the angular distance between two points on
983    the surface of a sphere.  The first distance of each point is assumed
984    to be the latitude, the second is the longitude, given in radians.
985    The dimension of the points must be 2:
986
987    .. math::
988       D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2)
989                                + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}]
990    """
991    cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2,
992                              ITYPE_t size) nogil except -1:
993        if size != 2:
994            with gil:
995                raise ValueError("Haversine distance only valid "
996                                 "in 2 dimensions")
997        cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0]))
998        cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1]))
999        return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)
1000
1001    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
1002                             ITYPE_t size) nogil except -1:
1003        if size != 2:
1004            with gil:
1005                raise ValueError("Haversine distance only valid in"
1006                                 " 2 dimensions")
1007        cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0]))
1008        cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1]))
1009        return 2 * asin(sqrt(sin_0 * sin_0 +
1010                             cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1))
1011
1012    cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1:
1013        return 2 * asin(sqrt(rdist))
1014
1015    cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1:
1016        cdef DTYPE_t tmp = sin(0.5 * dist)
1017        return tmp * tmp
1018
1019    def rdist_to_dist(self, rdist):
1020        return 2 * np.arcsin(np.sqrt(rdist))
1021
1022    def dist_to_rdist(self, dist):
1023        tmp = np.sin(0.5 * dist)
1024        return tmp * tmp
1025
1026
1027# ------------------------------------------------------------
1028# Yule Distance (boolean)
1029#  D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft)
1030# [This is not a true metric, so we will leave it out.]
1031#
1032# cdef class YuleDistance(DistanceMetric):
1033#    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size):
1034#        cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0
1035#        cdef np.intp_t j
1036#        for j in range(size):
1037#            tf1 = x1[j] != 0
1038#            tf2 = x2[j] != 0
1039#            ntt += tf1 and tf2
1040#            ntf += tf1 and (tf2 == 0)
1041#            nft += (tf1 == 0) and tf2
1042#        nff = size - ntt - ntf - nft
1043#        return (2.0 * ntf * nft) / (ntt * nff + ntf * nft)
1044
1045
1046# ------------------------------------------------------------
1047# Cosine Distance
1048#  D(x, y) = dot(x, y) / (|x| * |y|)
1049# [This is not a true metric, so we will leave it out. Use the `arccos`
1050#  distance instead]
1051
1052# cdef class CosineDistance(DistanceMetric):
1053#    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
1054#                             ITYPE_t size) nogil except -1:
1055#        cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0
1056#        cdef np.intp_t j
1057#        for j in range(size):
1058#            d += x1[j] * x2[j]
1059#            norm1 += x1[j] * x1[j]
1060#            norm2 += x2[j] * x2[j]
1061#        return 1.0 - d / sqrt(norm1 * norm2)
1062
1063# ------------------------------------------------------------
1064# Arccos Distance
1065#  D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI
1066
1067cdef class ArccosDistance(DistanceMetric):
1068    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
1069                             ITYPE_t size) nogil except -1:
1070        cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0
1071        cdef np.intp_t j
1072        for j in range(size):
1073            d += x1[j] * x2[j]
1074            norm1 += x1[j] * x1[j]
1075            norm2 += x2[j] * x2[j]
1076        return acos(d / sqrt(norm1 * norm2)) / M_PI
1077
1078
1079# ------------------------------------------------------------
1080# Correlation Distance
1081#  D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|)
1082# [This is not a true metric, so we will leave it out.]
1083#
1084# cdef class CorrelationDistance(DistanceMetric):
1085#    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size):
1086#        cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0
1087#        cdef DTYPE_t tmp1, tmp2
1088#
1089#        cdef np.intp_t i
1090#        for i in range(size):
1091#            mu1 += x1[i]
1092#            mu2 += x2[i]
1093#        mu1 /= size
1094#        mu2 /= size
1095#
1096#        for i in range(size):
1097#            tmp1 = x1[i] - mu1
1098#            tmp2 = x2[i] - mu2
1099#            x1nrm += tmp1 * tmp1
1100#            x2nrm += tmp2 * tmp2
1101#            x1Tx2 += tmp1 * tmp2
1102#
1103#        return (1. - x1Tx2) / sqrt(x1nrm * x2nrm)
1104
1105
1106# ------------------------------------------------------------
1107# User-defined distance
1108#
1109cdef class PyFuncDistance(DistanceMetric):
1110    """PyFunc Distance
1111    A user-defined distance
1112    Parameters
1113    ----------
1114    func : function
1115        func should take two numpy arrays as input, and return a distance.
1116    """
1117    def __init__(self, func, **kwargs):
1118        self.func = func
1119        self.kwargs = kwargs
1120
1121    # in cython < 0.26, GIL was required to be acquired during definition of
1122    # the function and inside the body of the function. This behaviour is not
1123    # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The
1124    # only way to be back compatible is to inherit `dist` from the base class
1125    # without GIL and called an inline `_dist` which acquire GIL.
1126    cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2,
1127                             ITYPE_t size) nogil except -1:
1128        return self._dist(x1, x2, size)
1129
1130    cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2,
1131                              ITYPE_t size) except -1 with gil:
1132        cdef np.ndarray x1arr
1133        cdef np.ndarray x2arr
1134        x1arr = _buffer_to_ndarray(x1, size)
1135        x2arr = _buffer_to_ndarray(x2, size)
1136        d = self.func(x1arr, x2arr, **self.kwargs)
1137        try:
1138            # Cython generates code here that results in a TypeError
1139            # if d is the wrong type.
1140            return d
1141        except TypeError:
1142            raise TypeError("Custom distance function must accept two "
1143                            "vectors and return a float.")
1144
1145
1146cdef inline double fmax(double a, double b) nogil:
1147    return max(a, b)
1148