1# !python 2# cython: boundscheck=False 3# cython: wraparound=False 4# cython: cdivision=True 5 6# By Jake Vanderplas (2013) <jakevdp@cs.washington.edu> 7# written for the scikit-learn project 8# modified for HDBSCAN Dual Tree Boruvka algorithm 9# License: BSD 10 11import numpy as np 12cimport numpy as np 13np.import_array() # required in order to use C-API 14 15from libc.math cimport fabs, sqrt, exp, cos, pow, log, acos, M_PI 16 17DTYPE = np.double 18ITYPE = np.intp 19 20 21###################################################################### 22# Numpy 1.3-1.4 compatibility utilities 23cdef DTYPE_t[:, ::1] get_memview_DTYPE_2D( 24 np.ndarray[DTYPE_t, ndim=2, mode='c'] X): 25 return <DTYPE_t[:X.shape[0], :X.shape[1]:1]> (<DTYPE_t*> X.data) 26 27 28cdef DTYPE_t* get_vec_ptr(np.ndarray[DTYPE_t, ndim=1, mode='c'] vec): 29 return &vec[0] 30 31 32cdef DTYPE_t* get_mat_ptr(np.ndarray[DTYPE_t, ndim=2, mode='c'] mat): 33 return &mat[0, 0] 34###################################################################### 35 36 37# First, define a function to get an ndarray from a memory bufffer 38cdef extern from "numpy/arrayobject.h": 39 object PyArray_SimpleNewFromData(int nd, np.npy_intp* dims, 40 int typenum, void* data) 41 42 43cdef inline np.ndarray _buffer_to_ndarray(DTYPE_t* x, np.npy_intp n): 44 # Wrap a memory buffer with an ndarray. Warning: this is not robust. 45 # In particular, if x is deallocated before the returned array goes 46 # out of scope, this could cause memory errors. Since there is not 47 # a possibility of this for our use-case, this should be safe. 48 49 # Note: this Segfaults unless np.import_array() is called above 50 return PyArray_SimpleNewFromData(1, &n, DTYPECODE, <void*>x) 51 52 53# some handy constants 54from libc.math cimport fabs, sqrt, exp, pow, cos, sin, asin 55cdef DTYPE_t INF = np.inf 56 57 58###################################################################### 59# newObj function 60# this is a helper function for pickling 61def newObj(obj): 62 return obj.__new__(obj) 63 64 65###################################################################### 66# metric mappings 67# These map from metric id strings to class names 68METRIC_MAPPING = {'euclidean': EuclideanDistance, 69 'l2': EuclideanDistance, 70 'minkowski': MinkowskiDistance, 71 'p': MinkowskiDistance, 72 'manhattan': ManhattanDistance, 73 'cityblock': ManhattanDistance, 74 'l1': ManhattanDistance, 75 'chebyshev': ChebyshevDistance, 76 'infinity': ChebyshevDistance, 77 'seuclidean': SEuclideanDistance, 78 'mahalanobis': MahalanobisDistance, 79 'wminkowski': WMinkowskiDistance, 80 'hamming': HammingDistance, 81 'canberra': CanberraDistance, 82 'braycurtis': BrayCurtisDistance, 83 'matching': MatchingDistance, 84 'jaccard': JaccardDistance, 85 'dice': DiceDistance, 86 'kulsinski': KulsinskiDistance, 87 'rogerstanimoto': RogersTanimotoDistance, 88 'russellrao': RussellRaoDistance, 89 'sokalmichener': SokalMichenerDistance, 90 'sokalsneath': SokalSneathDistance, 91 'haversine': HaversineDistance, 92 'cosine': ArccosDistance, 93 'arccos': ArccosDistance, 94 'pyfunc': PyFuncDistance} 95 96 97def get_valid_metric_ids(L): 98 """Given an iterable of metric class names or class identifiers, 99 return a list of metric IDs which map to those classes. 100 101 Examples 102 -------- 103 >>> L = get_valid_metric_ids([EuclideanDistance, 'ManhattanDistance']) 104 >>> sorted(L) 105 ['cityblock', 'euclidean', 'l1', 'l2', 'manhattan'] 106 """ 107 return [key for (key, val) in METRIC_MAPPING.items() 108 if (val.__name__ in L) or (val in L)] 109 110 111###################################################################### 112# Distance Metric Classes 113cdef class DistanceMetric: 114 """DistanceMetric class 115 116 This class provides a uniform interface to fast distance metric 117 functions. The various metrics can be accessed via the `get_metric` 118 class method and the metric string identifier (see below). 119 120 Examples 121 -------- 122 123 For example, to use the Euclidean distance: 124 125 >>> dist = DistanceMetric.get_metric('euclidean') 126 >>> X = [[0, 1, 2], 127 [3, 4, 5]]) 128 >>> dist.pairwise(X) 129 array([[ 0. , 5.19615242], 130 [ 5.19615242, 0. ]]) 131 132 Available Metrics 133 The following lists the string metric identifiers and the associated 134 distance metric classes: 135 136 **Metrics intended for real-valued vector spaces:** 137 138 ============== ==================== ======== =============================== 139 identifier class name args distance function 140 -------------- -------------------- -------- ------------------------------- 141 "euclidean" EuclideanDistance - ``sqrt(sum((x - y)^2))`` 142 "manhattan" ManhattanDistance - ``sum(|x - y|)`` 143 "chebyshev" ChebyshevDistance - ``sum(max(|x - y|))`` 144 "minkowski" MinkowskiDistance p ``sum(|x - y|^p)^(1/p)`` 145 "wminkowski" WMinkowskiDistance p, w ``sum(w * |x - y|^p)^(1/p)`` 146 "seuclidean" SEuclideanDistance V ``sqrt(sum((x - y)^2 / V))`` 147 "mahalanobis" MahalanobisDistance V or VI ``sqrt((x - y)' V^-1 (x - y))`` 148 ============== ==================== ======== =============================== 149 150 **Metrics intended for two-dimensional vector spaces:** Note that the haversine 151 distance metric requires data in the form of [latitude, longitude] and both 152 inputs and outputs are in units of radians. 153 154 ============ ================== ======================================== 155 identifier class name distance function 156 ------------ ------------------ ---------------------------------------- 157 "haversine" HaversineDistance 2 arcsin(sqrt(sin^2(0.5*dx) 158 + cos(x1)cos(x2)sin^2(0.5*dy))) 159 ============ ================== ======================================== 160 161 162 **Metrics intended for integer-valued vector spaces:** Though intended 163 for integer-valued vectors, these are also valid metrics in the case of 164 real-valued vectors. 165 166 ============= ==================== ======================================== 167 identifier class name distance function 168 ------------- -------------------- ---------------------------------------- 169 "hamming" HammingDistance ``N_unequal(x, y) / N_tot`` 170 "canberra" CanberraDistance ``sum(|x - y| / (|x| + |y|))`` 171 "braycurtis" BrayCurtisDistance ``sum(|x - y|) / (sum(|x|) + sum(|y|))`` 172 ============= ==================== ======================================== 173 174 **Metrics intended for boolean-valued vector spaces:** Any nonzero entry 175 is evaluated to "True". In the listings below, the following 176 abbreviations are used: 177 178 - N : number of dimensions 179 - NTT : number of dims in which both values are True 180 - NTF : number of dims in which the first value is True, second is False 181 - NFT : number of dims in which the first value is False, second is True 182 - NFF : number of dims in which both values are False 183 - NNEQ : number of non-equal dimensions, NNEQ = NTF + NFT 184 - NNZ : number of nonzero dimensions, NNZ = NTF + NFT + NTT 185 186 ================= ======================= =============================== 187 identifier class name distance function 188 ----------------- ----------------------- ------------------------------- 189 "jaccard" JaccardDistance NNEQ / NNZ 190 "maching" MatchingDistance NNEQ / N 191 "dice" DiceDistance NNEQ / (NTT + NNZ) 192 "kulsinski" KulsinskiDistance (NNEQ + N - NTT) / (NNEQ + N) 193 "rogerstanimoto" RogersTanimotoDistance 2 * NNEQ / (N + NNEQ) 194 "russellrao" RussellRaoDistance NNZ / N 195 "sokalmichener" SokalMichenerDistance 2 * NNEQ / (N + NNEQ) 196 "sokalsneath" SokalSneathDistance NNEQ / (NNEQ + 0.5 * NTT) 197 ================= ======================= =============================== 198 199 **User-defined distance:** 200 201 =========== =============== ======= 202 identifier class name args 203 ----------- --------------- ------- 204 "pyfunc" PyFuncDistance func 205 =========== =============== ======= 206 207 Here ``func`` is a function which takes two one-dimensional numpy 208 arrays, and returns a distance. Note that in order to be used within 209 the BallTree, the distance must be a true metric: 210 i.e. it must satisfy the following properties 211 212 1) Non-negativity: d(x, y) >= 0 213 2) Identity: d(x, y) = 0 if and only if x == y 214 3) Symmetry: d(x, y) = d(y, x) 215 4) Triangle Inequality: d(x, y) + d(y, z) >= d(x, z) 216 217 Because of the Python object overhead involved in calling the python 218 function, this will be fairly slow, but it will have the same 219 scaling as other distances. 220 """ 221 def __cinit__(self): 222 self.p = 2 223 self.vec = np.zeros(1, dtype=DTYPE, order='c') 224 self.mat = np.zeros((1, 1), dtype=DTYPE, order='c') 225 self.vec_ptr = get_vec_ptr(self.vec) 226 self.mat_ptr = get_mat_ptr(self.mat) 227 self.size = 1 228 229 def __reduce__(self): 230 """ 231 reduce method used for pickling 232 """ 233 return (newObj, (self.__class__,), self.__getstate__()) 234 235 def __getstate__(self): 236 """ 237 get state for pickling 238 """ 239 if self.__class__.__name__ == "PyFuncDistance": 240 return (float(self.p), self.vec, self.mat, self.func, self.kwargs) 241 return (float(self.p), self.vec, self.mat) 242 243 def __setstate__(self, state): 244 """ 245 set state for pickling 246 """ 247 self.p = state[0] 248 self.vec = state[1] 249 self.mat = state[2] 250 if self.__class__.__name__ == "PyFuncDistance": 251 self.func = state[3] 252 self.kwargs = state[4] 253 self.vec_ptr = get_vec_ptr(self.vec) 254 self.mat_ptr = get_mat_ptr(self.mat) 255 self.size = 1 256 257 @classmethod 258 def get_metric(cls, metric, **kwargs): 259 """Get the given distance metric from the string identifier. 260 261 See the docstring of DistanceMetric for a list of available metrics. 262 263 Parameters 264 ---------- 265 metric : string or class name 266 The distance metric to use 267 **kwargs 268 additional arguments will be passed to the requested metric 269 """ 270 if isinstance(metric, DistanceMetric): 271 return metric 272 273 if callable(metric): 274 return PyFuncDistance(metric, **kwargs) 275 276 # Map the metric string ID to the metric class 277 if isinstance(metric, type) and issubclass(metric, DistanceMetric): 278 pass 279 else: 280 try: 281 metric = METRIC_MAPPING[metric] 282 except: 283 raise ValueError("Unrecognized metric '%s'" % metric) 284 285 # In Minkowski special cases, return more efficient methods 286 if metric is MinkowskiDistance: 287 p = kwargs.pop('p', 2) 288 if p == 1: 289 return ManhattanDistance(**kwargs) 290 elif p == 2: 291 return EuclideanDistance(**kwargs) 292 elif np.isinf(p): 293 return ChebyshevDistance(**kwargs) 294 else: 295 return MinkowskiDistance(p, **kwargs) 296 else: 297 return metric(**kwargs) 298 299 def __init__(self): 300 if self.__class__ is DistanceMetric: 301 raise NotImplementedError("DistanceMetric is an abstract class") 302 303 cdef DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 304 ITYPE_t size) nogil except -1: 305 """Compute the distance between vectors x1 and x2 306 307 This should be overridden in a base class. 308 """ 309 return -999 310 311 cdef DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 312 ITYPE_t size) nogil except -1: 313 """Compute the reduced distance between vectors x1 and x2. 314 315 This can optionally be overridden in a base class. 316 317 The reduced distance is any measure that yields the same rank as the 318 distance, but is more efficient to compute. For example, for the 319 Euclidean metric, the reduced distance is the squared-euclidean 320 distance. 321 """ 322 return self.dist(x1, x2, size) 323 324 cdef int pdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] D) except -1: 325 """compute the pairwise distances between points in X""" 326 cdef ITYPE_t i1, i2 327 for i1 in range(X.shape[0]): 328 for i2 in range(i1, X.shape[0]): 329 D[i1, i2] = self.dist(&X[i1, 0], &X[i2, 0], X.shape[1]) 330 D[i2, i1] = D[i1, i2] 331 return 0 332 333 cdef int cdist(self, DTYPE_t[:, ::1] X, DTYPE_t[:, ::1] Y, 334 DTYPE_t[:, ::1] D) except -1: 335 """compute the cross-pairwise distances between arrays X and Y""" 336 cdef ITYPE_t i1, i2 337 if X.shape[1] != Y.shape[1]: 338 raise ValueError('X and Y must have the same second dimension') 339 for i1 in range(X.shape[0]): 340 for i2 in range(Y.shape[0]): 341 D[i1, i2] = self.dist(&X[i1, 0], &Y[i2, 0], X.shape[1]) 342 return 0 343 344 cdef DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 345 """Convert the reduced distance to the distance""" 346 return rdist 347 348 cdef DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 349 """Convert the distance to the reduced distance""" 350 return dist 351 352 def rdist_to_dist(self, rdist): 353 """Convert the Reduced distance to the true distance. 354 355 The reduced distance, defined for some metrics, is a computationally 356 more efficent measure which preserves the rank of the true distance. 357 For example, in the Euclidean distance metric, the reduced distance 358 is the squared-euclidean distance. 359 """ 360 return rdist 361 362 def dist_to_rdist(self, dist): 363 """Convert the true distance to the reduced distance. 364 365 The reduced distance, defined for some metrics, is a computationally 366 more efficent measure which preserves the rank of the true distance. 367 For example, in the Euclidean distance metric, the reduced distance 368 is the squared-euclidean distance. 369 """ 370 return dist 371 372 def pairwise(self, X, Y=None): 373 """Compute the pairwise distances between X and Y 374 375 This is a convenience routine for the sake of testing. For many 376 metrics, the utilities in scipy.spatial.distance.cdist and 377 scipy.spatial.distance.pdist will be faster. 378 379 Parameters 380 ---------- 381 X : array_like 382 Array of shape (Nx, D), representing Nx points in D dimensions. 383 Y : array_like (optional) 384 Array of shape (Ny, D), representing Ny points in D dimensions. 385 If not specified, then Y=X. 386 Returns 387 ------- 388 dist : ndarray 389 The shape (Nx, Ny) array of pairwise distances between points in 390 X and Y. 391 """ 392 cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Xarr 393 cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Yarr 394 cdef np.ndarray[DTYPE_t, ndim=2, mode='c'] Darr 395 396 Xarr = np.asarray(X, dtype=DTYPE, order='C') 397 if Y is None: 398 Darr = np.zeros((Xarr.shape[0], Xarr.shape[0]), 399 dtype=DTYPE, order='C') 400 self.pdist(get_memview_DTYPE_2D(Xarr), 401 get_memview_DTYPE_2D(Darr)) 402 else: 403 Yarr = np.asarray(Y, dtype=DTYPE, order='C') 404 Darr = np.zeros((Xarr.shape[0], Yarr.shape[0]), 405 dtype=DTYPE, order='C') 406 self.cdist(get_memview_DTYPE_2D(Xarr), 407 get_memview_DTYPE_2D(Yarr), 408 get_memview_DTYPE_2D(Darr)) 409 return Darr 410 411 412# ------------------------------------------------------------ 413# Euclidean Distance 414# d = sqrt(sum(x_i^2 - y_i^2)) 415cdef class EuclideanDistance(DistanceMetric): 416 """Euclidean Distance metric 417 418 .. math:: 419 D(x, y) = \sqrt{ \sum_i (x_i - y_i) ^ 2 } 420 """ 421 def __init__(self): 422 self.p = 2 423 424 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 425 ITYPE_t size) nogil except -1: 426 return euclidean_dist(x1, x2, size) 427 428 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 429 ITYPE_t size) nogil except -1: 430 return euclidean_rdist(x1, x2, size) 431 432 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 433 return sqrt(rdist) 434 435 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 436 return dist * dist 437 438 def rdist_to_dist(self, rdist): 439 return np.sqrt(rdist) 440 441 def dist_to_rdist(self, dist): 442 return dist ** 2 443 444 445# ------------------------------------------------------------ 446# SEuclidean Distance 447# d = sqrt(sum((x_i - y_i2)^2 / v_i)) 448cdef class SEuclideanDistance(DistanceMetric): 449 """Standardized Euclidean Distance metric 450 451 .. math:: 452 D(x, y) = \sqrt{ \sum_i \frac{ (x_i - y_i) ^ 2}{V_i} } 453 """ 454 def __init__(self, V): 455 self.vec = np.asarray(V, dtype=DTYPE) 456 self.vec_ptr = get_vec_ptr(self.vec) 457 self.size = self.vec.shape[0] 458 self.p = 2 459 460 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 461 ITYPE_t size) nogil except -1: 462 if size != self.size: 463 with gil: 464 raise ValueError('SEuclidean dist: size of V does not match') 465 cdef DTYPE_t tmp, d=0 466 cdef np.intp_t j 467 for j in range(size): 468 tmp = x1[j] - x2[j] 469 d += tmp * tmp / self.vec_ptr[j] 470 return d 471 472 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 473 ITYPE_t size) nogil except -1: 474 return sqrt(self.rdist(x1, x2, size)) 475 476 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 477 return sqrt(rdist) 478 479 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 480 return dist * dist 481 482 def rdist_to_dist(self, rdist): 483 return np.sqrt(rdist) 484 485 def dist_to_rdist(self, dist): 486 return dist ** 2 487 488 489# ------------------------------------------------------------ 490# Manhattan Distance 491# d = sum(abs(x_i - y_i)) 492cdef class ManhattanDistance(DistanceMetric): 493 """Manhattan/City-block Distance metric 494 495 .. math:: 496 D(x, y) = \sum_i |x_i - y_i| 497 """ 498 def __init__(self): 499 self.p = 1 500 501 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 502 ITYPE_t size) nogil except -1: 503 cdef DTYPE_t d = 0 504 cdef np.intp_t j 505 for j in range(size): 506 d += fabs(x1[j] - x2[j]) 507 return d 508 509 510# ------------------------------------------------------------ 511# Chebyshev Distance 512# d = max_i(abs(x_i), abs(y_i)) 513cdef class ChebyshevDistance(DistanceMetric): 514 """Chebyshev/Infinity Distance 515 516 .. math:: 517 D(x, y) = max_i (|x_i - y_i|) 518 """ 519 def __init__(self): 520 self.p = INF 521 522 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 523 ITYPE_t size) nogil except -1: 524 cdef DTYPE_t d = 0 525 cdef np.intp_t j 526 for j in range(size): 527 d = fmax(d, fabs(x1[j] - x2[j])) 528 return d 529 530 531# ------------------------------------------------------------ 532# Minkowski Distance 533# d = sum(x_i^p - y_i^p) ^ (1/p) 534cdef class MinkowskiDistance(DistanceMetric): 535 """Minkowski Distance 536 537 .. math:: 538 D(x, y) = [\sum_i (x_i - y_i)^p] ^ (1/p) 539 540 Minkowski Distance requires p >= 1 and finite. For p = infinity, 541 use ChebyshevDistance. 542 Note that for p=1, ManhattanDistance is more efficient, and for 543 p=2, EuclideanDistance is more efficient. 544 """ 545 def __init__(self, p): 546 if p < 1: 547 raise ValueError("p must be greater than 1") 548 elif np.isinf(p): 549 raise ValueError("MinkowskiDistance requires finite p. " 550 "For p=inf, use ChebyshevDistance.") 551 self.p = p 552 553 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 554 ITYPE_t size) nogil except -1: 555 cdef DTYPE_t d=0 556 cdef np.intp_t j 557 for j in range(size): 558 d += pow(fabs(x1[j] - x2[j]), self.p) 559 return d 560 561 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 562 ITYPE_t size) nogil except -1: 563 return pow(self.rdist(x1, x2, size), 1. / self.p) 564 565 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 566 return pow(rdist, 1. / self.p) 567 568 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 569 return pow(dist, self.p) 570 571 def rdist_to_dist(self, rdist): 572 return rdist ** (1. / self.p) 573 574 def dist_to_rdist(self, dist): 575 return dist ** self.p 576 577 578# ------------------------------------------------------------ 579# W-Minkowski Distance 580# d = sum(w_i * (x_i^p - y_i^p)) ^ (1/p) 581cdef class WMinkowskiDistance(DistanceMetric): 582 """Weighted Minkowski Distance 583 584 .. math:: 585 D(x, y) = [\sum_i w_i (x_i - y_i)^p] ^ (1/p) 586 587 Weighted Minkowski Distance requires p >= 1 and finite. 588 589 Parameters 590 ---------- 591 p : int 592 The order of the norm of the difference :math:`{||u-v||}_p`. 593 w : (N,) array_like 594 The weight vector. 595 596 """ 597 def __init__(self, p, w): 598 if p < 1: 599 raise ValueError("p must be greater than 1") 600 elif np.isinf(p): 601 raise ValueError("WMinkowskiDistance requires finite p. " 602 "For p=inf, use ChebyshevDistance.") 603 self.p = p 604 self.vec = np.asarray(w, dtype=DTYPE) 605 self.vec_ptr = get_vec_ptr(self.vec) 606 self.size = self.vec.shape[0] 607 608 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 609 ITYPE_t size) nogil except -1: 610 if size != self.size: 611 with gil: 612 raise ValueError('WMinkowskiDistance dist: ' 613 'size of w does not match') 614 cdef DTYPE_t d=0 615 cdef np.intp_t j 616 for j in range(size): 617 d += pow(self.vec_ptr[j] * fabs(x1[j] - x2[j]), self.p) 618 return d 619 620 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 621 ITYPE_t size) nogil except -1: 622 return pow(self.rdist(x1, x2, size), 1. / self.p) 623 624 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 625 return pow(rdist, 1. / self.p) 626 627 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 628 return pow(dist, self.p) 629 630 def rdist_to_dist(self, rdist): 631 return rdist ** (1. / self.p) 632 633 def dist_to_rdist(self, dist): 634 return dist ** self.p 635 636 637# ------------------------------------------------------------ 638# Mahalanobis Distance 639# d = sqrt( (x - y)^T V^-1 (x - y) ) 640cdef class MahalanobisDistance(DistanceMetric): 641 """Mahalanobis Distance 642 643 .. math:: 644 D(x, y) = \sqrt{ (x - y)^T V^{-1} (x - y) } 645 646 Parameters 647 ---------- 648 V : array_like 649 Symmetric positive-definite covariance matrix. 650 The inverse of this matrix will be explicitly computed. 651 VI : array_like 652 optionally specify the inverse directly. If VI is passed, 653 then V is not referenced. 654 """ 655 def __init__(self, V=None, VI=None): 656 if VI is None: 657 VI = np.linalg.inv(V) 658 if VI.ndim != 2 or VI.shape[0] != VI.shape[1]: 659 raise ValueError("V/VI must be square") 660 661 self.mat = np.asarray(VI, dtype=float, order='C') 662 self.mat_ptr = get_mat_ptr(self.mat) 663 664 self.size = self.mat.shape[0] 665 666 # we need vec as a work buffer 667 self.vec = np.zeros(self.size, dtype=DTYPE) 668 self.vec_ptr = get_vec_ptr(self.vec) 669 670 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 671 ITYPE_t size) nogil except -1: 672 if size != self.size: 673 with gil: 674 raise ValueError('Mahalanobis dist: size of V does not match') 675 676 cdef DTYPE_t tmp, d = 0 677 cdef np.intp_t i, j 678 679 # compute (x1 - x2).T * VI * (x1 - x2) 680 for i in range(size): 681 self.vec_ptr[i] = x1[i] - x2[i] 682 683 for i in range(size): 684 tmp = 0 685 for j in range(size): 686 tmp += self.mat_ptr[i * size + j] * self.vec_ptr[j] 687 d += tmp * self.vec_ptr[i] 688 return d 689 690 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 691 ITYPE_t size) nogil except -1: 692 return sqrt(self.rdist(x1, x2, size)) 693 694 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 695 return sqrt(rdist) 696 697 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 698 return dist * dist 699 700 def rdist_to_dist(self, rdist): 701 return np.sqrt(rdist) 702 703 def dist_to_rdist(self, dist): 704 return dist ** 2 705 706 707# ------------------------------------------------------------ 708# Hamming Distance 709# d = N_unequal(x, y) / N_tot 710cdef class HammingDistance(DistanceMetric): 711 """Hamming Distance 712 713 Hamming distance is meant for discrete-valued vectors, though it is 714 a valid metric for real-valued vectors. 715 716 .. math:: 717 D(x, y) = \frac{1}{N} \sum_i \delta_{x_i, y_i} 718 """ 719 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 720 ITYPE_t size) nogil except -1: 721 cdef int n_unequal = 0 722 cdef np.intp_t j 723 for j in range(size): 724 if x1[j] != x2[j]: 725 n_unequal += 1 726 return float(n_unequal) / size 727 728 729# ------------------------------------------------------------ 730# Canberra Distance 731# D(x, y) = sum[ abs(x_i - y_i) / (abs(x_i) + abs(y_i)) ] 732cdef class CanberraDistance(DistanceMetric): 733 """Canberra Distance 734 735 Canberra distance is meant for discrete-valued vectors, though it is 736 a valid metric for real-valued vectors. 737 738 .. math:: 739 D(x, y) = \sum_i \frac{|x_i - y_i|}{|x_i| + |y_i|} 740 """ 741 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 742 ITYPE_t size) nogil except -1: 743 cdef DTYPE_t denom, d = 0 744 cdef np.intp_t j 745 for j in range(size): 746 denom = fabs(x1[j]) + fabs(x2[j]) 747 if denom > 0: 748 d += fabs(x1[j] - x2[j]) / denom 749 return d 750 751 752# ------------------------------------------------------------ 753# Bray-Curtis Distance 754# D(x, y) = sum[abs(x_i - y_i)] / sum[abs(x_i) + abs(y_i)] 755cdef class BrayCurtisDistance(DistanceMetric): 756 """Bray-Curtis Distance 757 758 Bray-Curtis distance is meant for discrete-valued vectors, though it is 759 a valid metric for real-valued vectors. 760 761 .. math:: 762 D(x, y) = \frac{\sum_i |x_i - y_i|}{\sum_i(|x_i| + |y_i|)} 763 """ 764 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 765 ITYPE_t size) nogil except -1: 766 cdef DTYPE_t num = 0, denom = 0 767 cdef np.intp_t j 768 for j in range(size): 769 num += fabs(x1[j] - x2[j]) 770 denom += fabs(x1[j]) + fabs(x2[j]) 771 if denom > 0: 772 return num / denom 773 else: 774 return 0.0 775 776 777# ------------------------------------------------------------ 778# Jaccard Distance (boolean) 779# D(x, y) = N_unequal(x, y) / N_nonzero(x, y) 780cdef class JaccardDistance(DistanceMetric): 781 """Jaccard Distance 782 783 Jaccard Distance is a dissimilarity measure for boolean-valued 784 vectors. All nonzero entries will be treated as True, zero entries will 785 be treated as False. 786 787 .. math:: 788 D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} + N_{TF} + N_{FT}} 789 """ 790 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 791 ITYPE_t size) nogil except -1: 792 cdef int tf1, tf2, n_eq = 0, nnz = 0 793 cdef np.intp_t j 794 for j in range(size): 795 tf1 = x1[j] != 0 796 tf2 = x2[j] != 0 797 nnz += (tf1 or tf2) 798 n_eq += (tf1 and tf2) 799 if nnz == 0: 800 return 0.0 801 return (nnz - n_eq) * 1.0 / nnz 802 803 804# ------------------------------------------------------------ 805# Matching Distance (boolean) 806# D(x, y) = n_neq / n 807cdef class MatchingDistance(DistanceMetric): 808 """Matching Distance 809 810 Matching Distance is a dissimilarity measure for boolean-valued 811 vectors. All nonzero entries will be treated as True, zero entries will 812 be treated as False. 813 814 .. math:: 815 D(x, y) = \frac{N_{TF} + N_{FT}}{N} 816 """ 817 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 818 ITYPE_t size) nogil except -1: 819 cdef int tf1, tf2, n_neq = 0 820 cdef np.intp_t j 821 for j in range(size): 822 tf1 = x1[j] != 0 823 tf2 = x2[j] != 0 824 n_neq += (tf1 != tf2) 825 return n_neq * 1. / size 826 827 828# ------------------------------------------------------------ 829# Dice Distance (boolean) 830# D(x, y) = n_neq / (2 * ntt + n_neq) 831cdef class DiceDistance(DistanceMetric): 832 """Dice Distance 833 834 Dice Distance is a dissimilarity measure for boolean-valued 835 vectors. All nonzero entries will be treated as True, zero entries will 836 be treated as False. 837 838 .. math:: 839 D(x, y) = \frac{N_{TF} + N_{FT}}{2 * N_{TT} + N_{TF} + N_{FT}} 840 """ 841 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 842 ITYPE_t size) nogil except -1: 843 cdef int tf1, tf2, n_neq = 0, ntt = 0 844 cdef np.intp_t j 845 for j in range(size): 846 tf1 = x1[j] != 0 847 tf2 = x2[j] != 0 848 ntt += (tf1 and tf2) 849 n_neq += (tf1 != tf2) 850 return n_neq / (2.0 * ntt + n_neq) 851 852 853# ------------------------------------------------------------ 854# Kulsinski Distance (boolean) 855# D(x, y) = (ntf + nft - ntt + n) / (n_neq + n) 856cdef class KulsinskiDistance(DistanceMetric): 857 """Kulsinski Distance 858 859 Kulsinski Distance is a dissimilarity measure for boolean-valued 860 vectors. All nonzero entries will be treated as True, zero entries will 861 be treated as False. 862 863 .. math:: 864 D(x, y) = 1 - \frac{N_{TT}}{N + N_{TF} + N_{FT}} 865 """ 866 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 867 ITYPE_t size) nogil except -1: 868 cdef int tf1, tf2, ntt = 0, n_neq = 0 869 cdef np.intp_t j 870 for j in range(size): 871 tf1 = x1[j] != 0 872 tf2 = x2[j] != 0 873 n_neq += (tf1 != tf2) 874 ntt += (tf1 and tf2) 875 return (n_neq - ntt + size) * 1.0 / (n_neq + size) 876 877 878# ------------------------------------------------------------ 879# Rogers-Tanimoto Distance (boolean) 880# D(x, y) = 2 * n_neq / (n + n_neq) 881cdef class RogersTanimotoDistance(DistanceMetric): 882 """Rogers-Tanimoto Distance 883 884 Rogers-Tanimoto Distance is a dissimilarity measure for boolean-valued 885 vectors. All nonzero entries will be treated as True, zero entries will 886 be treated as False. 887 888 .. math:: 889 D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} 890 """ 891 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 892 ITYPE_t size) nogil except -1: 893 cdef int tf1, tf2, n_neq = 0 894 cdef np.intp_t j 895 for j in range(size): 896 tf1 = x1[j] != 0 897 tf2 = x2[j] != 0 898 n_neq += (tf1 != tf2) 899 return (2.0 * n_neq) / (size + n_neq) 900 901 902# ------------------------------------------------------------ 903# Russell-Rao Distance (boolean) 904# D(x, y) = (n - ntt) / n 905cdef class RussellRaoDistance(DistanceMetric): 906 """Russell-Rao Distance 907 908 Russell-Rao Distance is a dissimilarity measure for boolean-valued 909 vectors. All nonzero entries will be treated as True, zero entries will 910 be treated as False. 911 912 .. math:: 913 D(x, y) = \frac{N - N_{TT}}{N} 914 """ 915 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 916 ITYPE_t size) nogil except -1: 917 cdef int tf1, tf2, ntt = 0 918 cdef np.intp_t j 919 for j in range(size): 920 tf1 = x1[j] != 0 921 tf2 = x2[j] != 0 922 ntt += (tf1 and tf2) 923 return (size - ntt) * 1. / size 924 925 926# ------------------------------------------------------------ 927# Sokal-Michener Distance (boolean) 928# D(x, y) = 2 * n_neq / (n + n_neq) 929cdef class SokalMichenerDistance(DistanceMetric): 930 """Sokal-Michener Distance 931 932 Sokal-Michener Distance is a dissimilarity measure for boolean-valued 933 vectors. All nonzero entries will be treated as True, zero entries will 934 be treated as False. 935 936 .. math:: 937 D(x, y) = \frac{2 (N_{TF} + N_{FT})}{N + N_{TF} + N_{FT}} 938 """ 939 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 940 ITYPE_t size) nogil except -1: 941 cdef int tf1, tf2, n_neq = 0 942 cdef np.intp_t j 943 for j in range(size): 944 tf1 = x1[j] != 0 945 tf2 = x2[j] != 0 946 n_neq += (tf1 != tf2) 947 return (2.0 * n_neq) / (size + n_neq) 948 949 950# ------------------------------------------------------------ 951# Sokal-Sneath Distance (boolean) 952# D(x, y) = n_neq / (0.5 * n_tt + n_neq) 953cdef class SokalSneathDistance(DistanceMetric): 954 """Sokal-Sneath Distance 955 956 Sokal-Sneath Distance is a dissimilarity measure for boolean-valued 957 vectors. All nonzero entries will be treated as True, zero entries will 958 be treated as False. 959 960 .. math:: 961 D(x, y) = \frac{N_{TF} + N_{FT}}{N_{TT} / 2 + N_{TF} + N_{FT}} 962 """ 963 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 964 ITYPE_t size) nogil except -1: 965 cdef int tf1, tf2, ntt = 0, n_neq = 0 966 cdef np.intp_t j 967 for j in range(size): 968 tf1 = x1[j] != 0 969 tf2 = x2[j] != 0 970 n_neq += (tf1 != tf2) 971 ntt += (tf1 and tf2) 972 return n_neq / (0.5 * ntt + n_neq) 973 974 975# ------------------------------------------------------------ 976# Haversine Distance (2 dimensional) 977# D(x, y) = 2 arcsin{sqrt[sin^2 ((x1 - y1) / 2) 978# + cos(x1) cos(y1) sin^2 ((x2 - y2) / 2)]} 979cdef class HaversineDistance(DistanceMetric): 980 """Haversine (Spherical) Distance 981 982 The Haversine distance is the angular distance between two points on 983 the surface of a sphere. The first distance of each point is assumed 984 to be the latitude, the second is the longitude, given in radians. 985 The dimension of the points must be 2: 986 987 .. math:: 988 D(x, y) = 2\arcsin[\sqrt{\sin^2((x1 - y1) / 2) 989 + cos(x1)cos(y1)sin^2((x2 - y2) / 2)}] 990 """ 991 cdef inline DTYPE_t rdist(self, DTYPE_t* x1, DTYPE_t* x2, 992 ITYPE_t size) nogil except -1: 993 if size != 2: 994 with gil: 995 raise ValueError("Haversine distance only valid " 996 "in 2 dimensions") 997 cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) 998 cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) 999 return (sin_0 * sin_0 + cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1) 1000 1001 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 1002 ITYPE_t size) nogil except -1: 1003 if size != 2: 1004 with gil: 1005 raise ValueError("Haversine distance only valid in" 1006 " 2 dimensions") 1007 cdef DTYPE_t sin_0 = sin(0.5 * (x1[0] - x2[0])) 1008 cdef DTYPE_t sin_1 = sin(0.5 * (x1[1] - x2[1])) 1009 return 2 * asin(sqrt(sin_0 * sin_0 + 1010 cos(x1[0]) * cos(x2[0]) * sin_1 * sin_1)) 1011 1012 cdef inline DTYPE_t _rdist_to_dist(self, DTYPE_t rdist) except -1: 1013 return 2 * asin(sqrt(rdist)) 1014 1015 cdef inline DTYPE_t _dist_to_rdist(self, DTYPE_t dist) nogil except -1: 1016 cdef DTYPE_t tmp = sin(0.5 * dist) 1017 return tmp * tmp 1018 1019 def rdist_to_dist(self, rdist): 1020 return 2 * np.arcsin(np.sqrt(rdist)) 1021 1022 def dist_to_rdist(self, dist): 1023 tmp = np.sin(0.5 * dist) 1024 return tmp * tmp 1025 1026 1027# ------------------------------------------------------------ 1028# Yule Distance (boolean) 1029# D(x, y) = 2 * ntf * nft / (ntt * nff + ntf * nft) 1030# [This is not a true metric, so we will leave it out.] 1031# 1032# cdef class YuleDistance(DistanceMetric): 1033# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): 1034# cdef int tf1, tf2, ntf = 0, nft = 0, ntt = 0, nff = 0 1035# cdef np.intp_t j 1036# for j in range(size): 1037# tf1 = x1[j] != 0 1038# tf2 = x2[j] != 0 1039# ntt += tf1 and tf2 1040# ntf += tf1 and (tf2 == 0) 1041# nft += (tf1 == 0) and tf2 1042# nff = size - ntt - ntf - nft 1043# return (2.0 * ntf * nft) / (ntt * nff + ntf * nft) 1044 1045 1046# ------------------------------------------------------------ 1047# Cosine Distance 1048# D(x, y) = dot(x, y) / (|x| * |y|) 1049# [This is not a true metric, so we will leave it out. Use the `arccos` 1050# distance instead] 1051 1052# cdef class CosineDistance(DistanceMetric): 1053# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 1054# ITYPE_t size) nogil except -1: 1055# cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 1056# cdef np.intp_t j 1057# for j in range(size): 1058# d += x1[j] * x2[j] 1059# norm1 += x1[j] * x1[j] 1060# norm2 += x2[j] * x2[j] 1061# return 1.0 - d / sqrt(norm1 * norm2) 1062 1063# ------------------------------------------------------------ 1064# Arccos Distance 1065# D(x, y) = arccos(dot(x, y) / (|x| * |y|)) / PI 1066 1067cdef class ArccosDistance(DistanceMetric): 1068 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 1069 ITYPE_t size) nogil except -1: 1070 cdef DTYPE_t d = 0, norm1 = 0, norm2 = 0 1071 cdef np.intp_t j 1072 for j in range(size): 1073 d += x1[j] * x2[j] 1074 norm1 += x1[j] * x1[j] 1075 norm2 += x2[j] * x2[j] 1076 return acos(d / sqrt(norm1 * norm2)) / M_PI 1077 1078 1079# ------------------------------------------------------------ 1080# Correlation Distance 1081# D(x, y) = dot((x - mx), (y - my)) / (|x - mx| * |y - my|) 1082# [This is not a true metric, so we will leave it out.] 1083# 1084# cdef class CorrelationDistance(DistanceMetric): 1085# cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, ITYPE_t size): 1086# cdef DTYPE_t mu1 = 0, mu2 = 0, x1nrm = 0, x2nrm = 0, x1Tx2 = 0 1087# cdef DTYPE_t tmp1, tmp2 1088# 1089# cdef np.intp_t i 1090# for i in range(size): 1091# mu1 += x1[i] 1092# mu2 += x2[i] 1093# mu1 /= size 1094# mu2 /= size 1095# 1096# for i in range(size): 1097# tmp1 = x1[i] - mu1 1098# tmp2 = x2[i] - mu2 1099# x1nrm += tmp1 * tmp1 1100# x2nrm += tmp2 * tmp2 1101# x1Tx2 += tmp1 * tmp2 1102# 1103# return (1. - x1Tx2) / sqrt(x1nrm * x2nrm) 1104 1105 1106# ------------------------------------------------------------ 1107# User-defined distance 1108# 1109cdef class PyFuncDistance(DistanceMetric): 1110 """PyFunc Distance 1111 A user-defined distance 1112 Parameters 1113 ---------- 1114 func : function 1115 func should take two numpy arrays as input, and return a distance. 1116 """ 1117 def __init__(self, func, **kwargs): 1118 self.func = func 1119 self.kwargs = kwargs 1120 1121 # in cython < 0.26, GIL was required to be acquired during definition of 1122 # the function and inside the body of the function. This behaviour is not 1123 # allowed in cython >= 0.26 since it is a redundant GIL acquisition. The 1124 # only way to be back compatible is to inherit `dist` from the base class 1125 # without GIL and called an inline `_dist` which acquire GIL. 1126 cdef inline DTYPE_t dist(self, DTYPE_t* x1, DTYPE_t* x2, 1127 ITYPE_t size) nogil except -1: 1128 return self._dist(x1, x2, size) 1129 1130 cdef inline DTYPE_t _dist(self, DTYPE_t* x1, DTYPE_t* x2, 1131 ITYPE_t size) except -1 with gil: 1132 cdef np.ndarray x1arr 1133 cdef np.ndarray x2arr 1134 x1arr = _buffer_to_ndarray(x1, size) 1135 x2arr = _buffer_to_ndarray(x2, size) 1136 d = self.func(x1arr, x2arr, **self.kwargs) 1137 try: 1138 # Cython generates code here that results in a TypeError 1139 # if d is the wrong type. 1140 return d 1141 except TypeError: 1142 raise TypeError("Custom distance function must accept two " 1143 "vectors and return a float.") 1144 1145 1146cdef inline double fmax(double a, double b) nogil: 1147 return max(a, b) 1148