1# -*- coding: utf-8 -*- 2 3# Authors: Alexandre Gramfort <alexandre.gramfort@inria.fr> 4# Mathieu Blondel <mathieu@mblondel.org> 5# Robert Layton <robertlayton@gmail.com> 6# Andreas Mueller <amueller@ais.uni-bonn.de> 7# Philippe Gervais <philippe.gervais@inria.fr> 8# Lars Buitinck 9# Joel Nothman <joel.nothman@gmail.com> 10# License: BSD 3 clause 11 12import itertools 13from functools import partial 14import warnings 15 16import numpy as np 17from scipy.spatial import distance 18from scipy.sparse import csr_matrix 19from scipy.sparse import issparse 20from joblib import Parallel, effective_n_jobs 21 22from ..utils.validation import _num_samples 23from ..utils.validation import check_non_negative 24from ..utils import check_array 25from ..utils import gen_even_slices 26from ..utils import gen_batches, get_chunk_n_rows 27from ..utils import is_scalar_nan 28from ..utils.extmath import row_norms, safe_sparse_dot 29from ..preprocessing import normalize 30from ..utils._mask import _get_mask 31from ..utils.fixes import delayed 32from ..utils.fixes import sp_version, parse_version 33 34from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan 35from ..exceptions import DataConversionWarning 36 37 38# Utility Functions 39def _return_float_dtype(X, Y): 40 """ 41 1. If dtype of X and Y is float32, then dtype float32 is returned. 42 2. Else dtype float is returned. 43 """ 44 if not issparse(X) and not isinstance(X, np.ndarray): 45 X = np.asarray(X) 46 47 if Y is None: 48 Y_dtype = X.dtype 49 elif not issparse(Y) and not isinstance(Y, np.ndarray): 50 Y = np.asarray(Y) 51 Y_dtype = Y.dtype 52 else: 53 Y_dtype = Y.dtype 54 55 if X.dtype == Y_dtype == np.float32: 56 dtype = np.float32 57 else: 58 dtype = float 59 60 return X, Y, dtype 61 62 63def check_pairwise_arrays( 64 X, 65 Y, 66 *, 67 precomputed=False, 68 dtype=None, 69 accept_sparse="csr", 70 force_all_finite=True, 71 copy=False, 72): 73 """Set X and Y appropriately and checks inputs. 74 75 If Y is None, it is set as a pointer to X (i.e. not a copy). 76 If Y is given, this does not happen. 77 All distance metrics should use this function first to assert that the 78 given parameters are correct and safe to use. 79 80 Specifically, this function first ensures that both X and Y are arrays, 81 then checks that they are at least two dimensional while ensuring that 82 their elements are floats (or dtype if provided). Finally, the function 83 checks that the size of the second dimension of the two arrays is equal, or 84 the equivalent check for a precomputed distance matrix. 85 86 Parameters 87 ---------- 88 X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 89 90 Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) 91 92 precomputed : bool, default=False 93 True if X is to be treated as precomputed distances to the samples in 94 Y. 95 96 dtype : str, type, list of type, default=None 97 Data type required for X and Y. If None, the dtype will be an 98 appropriate float type selected by _return_float_dtype. 99 100 .. versionadded:: 0.18 101 102 accept_sparse : str, bool or list/tuple of str, default='csr' 103 String[s] representing allowed sparse matrix formats, such as 'csc', 104 'csr', etc. If the input is sparse but not in the allowed format, 105 it will be converted to the first listed format. True allows the input 106 to be any format. False means that a sparse matrix input will 107 raise an error. 108 109 force_all_finite : bool or 'allow-nan', default=True 110 Whether to raise an error on np.inf, np.nan, pd.NA in array. The 111 possibilities are: 112 113 - True: Force all values of array to be finite. 114 - False: accepts np.inf, np.nan, pd.NA in array. 115 - 'allow-nan': accepts only np.nan and pd.NA values in array. Values 116 cannot be infinite. 117 118 .. versionadded:: 0.22 119 ``force_all_finite`` accepts the string ``'allow-nan'``. 120 121 .. versionchanged:: 0.23 122 Accepts `pd.NA` and converts it into `np.nan`. 123 124 copy : bool, default=False 125 Whether a forced copy will be triggered. If copy=False, a copy might 126 be triggered by a conversion. 127 128 .. versionadded:: 0.22 129 130 Returns 131 ------- 132 safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 133 An array equal to X, guaranteed to be a numpy array. 134 135 safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) 136 An array equal to Y if Y was not None, guaranteed to be a numpy array. 137 If Y was None, safe_Y will be a pointer to X. 138 139 """ 140 X, Y, dtype_float = _return_float_dtype(X, Y) 141 142 estimator = "check_pairwise_arrays" 143 if dtype is None: 144 dtype = dtype_float 145 146 if Y is X or Y is None: 147 X = Y = check_array( 148 X, 149 accept_sparse=accept_sparse, 150 dtype=dtype, 151 copy=copy, 152 force_all_finite=force_all_finite, 153 estimator=estimator, 154 ) 155 else: 156 X = check_array( 157 X, 158 accept_sparse=accept_sparse, 159 dtype=dtype, 160 copy=copy, 161 force_all_finite=force_all_finite, 162 estimator=estimator, 163 ) 164 Y = check_array( 165 Y, 166 accept_sparse=accept_sparse, 167 dtype=dtype, 168 copy=copy, 169 force_all_finite=force_all_finite, 170 estimator=estimator, 171 ) 172 173 if precomputed: 174 if X.shape[1] != Y.shape[0]: 175 raise ValueError( 176 "Precomputed metric requires shape " 177 "(n_queries, n_indexed). Got (%d, %d) " 178 "for %d indexed." % (X.shape[0], X.shape[1], Y.shape[0]) 179 ) 180 elif X.shape[1] != Y.shape[1]: 181 raise ValueError( 182 "Incompatible dimension for X and Y matrices: " 183 "X.shape[1] == %d while Y.shape[1] == %d" % (X.shape[1], Y.shape[1]) 184 ) 185 186 return X, Y 187 188 189def check_paired_arrays(X, Y): 190 """Set X and Y appropriately and checks inputs for paired distances. 191 192 All paired distance metrics should use this function first to assert that 193 the given parameters are correct and safe to use. 194 195 Specifically, this function first ensures that both X and Y are arrays, 196 then checks that they are at least two dimensional while ensuring that 197 their elements are floats. Finally, the function checks that the size 198 of the dimensions of the two arrays are equal. 199 200 Parameters 201 ---------- 202 X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 203 204 Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) 205 206 Returns 207 ------- 208 safe_X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 209 An array equal to X, guaranteed to be a numpy array. 210 211 safe_Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) 212 An array equal to Y if Y was not None, guaranteed to be a numpy array. 213 If Y was None, safe_Y will be a pointer to X. 214 215 """ 216 X, Y = check_pairwise_arrays(X, Y) 217 if X.shape != Y.shape: 218 raise ValueError( 219 "X and Y should be of same shape. They were respectively %r and %r long." 220 % (X.shape, Y.shape) 221 ) 222 return X, Y 223 224 225# Pairwise distances 226def euclidean_distances( 227 X, Y=None, *, Y_norm_squared=None, squared=False, X_norm_squared=None 228): 229 """ 230 Compute the distance matrix between each pair from a vector array X and Y. 231 232 For efficiency reasons, the euclidean distance between a pair of row 233 vector x and y is computed as:: 234 235 dist(x, y) = sqrt(dot(x, x) - 2 * dot(x, y) + dot(y, y)) 236 237 This formulation has two advantages over other ways of computing distances. 238 First, it is computationally efficient when dealing with sparse data. 239 Second, if one argument varies but the other remains unchanged, then 240 `dot(x, x)` and/or `dot(y, y)` can be pre-computed. 241 242 However, this is not the most precise way of doing this computation, 243 because this equation potentially suffers from "catastrophic cancellation". 244 Also, the distance matrix returned by this function may not be exactly 245 symmetric as required by, e.g., ``scipy.spatial.distance`` functions. 246 247 Read more in the :ref:`User Guide <metrics>`. 248 249 Parameters 250 ---------- 251 X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 252 An array where each row is a sample and each column is a feature. 253 254 Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ 255 default=None 256 An array where each row is a sample and each column is a feature. 257 If `None`, method uses `Y=X`. 258 259 Y_norm_squared : array-like of shape (n_samples_Y,) or (n_samples_Y, 1) \ 260 or (1, n_samples_Y), default=None 261 Pre-computed dot-products of vectors in Y (e.g., 262 ``(Y**2).sum(axis=1)``) 263 May be ignored in some cases, see the note below. 264 265 squared : bool, default=False 266 Return squared Euclidean distances. 267 268 X_norm_squared : array-like of shape (n_samples_X,) or (n_samples_X, 1) \ 269 or (1, n_samples_X), default=None 270 Pre-computed dot-products of vectors in X (e.g., 271 ``(X**2).sum(axis=1)``) 272 May be ignored in some cases, see the note below. 273 274 Returns 275 ------- 276 distances : ndarray of shape (n_samples_X, n_samples_Y) 277 Returns the distances between the row vectors of `X` 278 and the row vectors of `Y`. 279 280 See Also 281 -------- 282 paired_distances : Distances betweens pairs of elements of X and Y. 283 284 Notes 285 ----- 286 To achieve a better accuracy, `X_norm_squared` and `Y_norm_squared` may be 287 unused if they are passed as `np.float32`. 288 289 Examples 290 -------- 291 >>> from sklearn.metrics.pairwise import euclidean_distances 292 >>> X = [[0, 1], [1, 1]] 293 >>> # distance between rows of X 294 >>> euclidean_distances(X, X) 295 array([[0., 1.], 296 [1., 0.]]) 297 >>> # get distance to origin 298 >>> euclidean_distances(X, [[0, 0]]) 299 array([[1. ], 300 [1.41421356]]) 301 """ 302 X, Y = check_pairwise_arrays(X, Y) 303 304 if X_norm_squared is not None: 305 X_norm_squared = check_array(X_norm_squared, ensure_2d=False) 306 original_shape = X_norm_squared.shape 307 if X_norm_squared.shape == (X.shape[0],): 308 X_norm_squared = X_norm_squared.reshape(-1, 1) 309 if X_norm_squared.shape == (1, X.shape[0]): 310 X_norm_squared = X_norm_squared.T 311 if X_norm_squared.shape != (X.shape[0], 1): 312 raise ValueError( 313 f"Incompatible dimensions for X of shape {X.shape} and " 314 f"X_norm_squared of shape {original_shape}." 315 ) 316 317 if Y_norm_squared is not None: 318 Y_norm_squared = check_array(Y_norm_squared, ensure_2d=False) 319 original_shape = Y_norm_squared.shape 320 if Y_norm_squared.shape == (Y.shape[0],): 321 Y_norm_squared = Y_norm_squared.reshape(1, -1) 322 if Y_norm_squared.shape == (Y.shape[0], 1): 323 Y_norm_squared = Y_norm_squared.T 324 if Y_norm_squared.shape != (1, Y.shape[0]): 325 raise ValueError( 326 f"Incompatible dimensions for Y of shape {Y.shape} and " 327 f"Y_norm_squared of shape {original_shape}." 328 ) 329 330 return _euclidean_distances(X, Y, X_norm_squared, Y_norm_squared, squared) 331 332 333def _euclidean_distances(X, Y, X_norm_squared=None, Y_norm_squared=None, squared=False): 334 """Computational part of euclidean_distances 335 336 Assumes inputs are already checked. 337 338 If norms are passed as float32, they are unused. If arrays are passed as 339 float32, norms needs to be recomputed on upcast chunks. 340 TODO: use a float64 accumulator in row_norms to avoid the latter. 341 """ 342 if X_norm_squared is not None: 343 if X_norm_squared.dtype == np.float32: 344 XX = None 345 else: 346 XX = X_norm_squared.reshape(-1, 1) 347 elif X.dtype == np.float32: 348 XX = None 349 else: 350 XX = row_norms(X, squared=True)[:, np.newaxis] 351 352 if Y is X: 353 YY = None if XX is None else XX.T 354 else: 355 if Y_norm_squared is not None: 356 if Y_norm_squared.dtype == np.float32: 357 YY = None 358 else: 359 YY = Y_norm_squared.reshape(1, -1) 360 elif Y.dtype == np.float32: 361 YY = None 362 else: 363 YY = row_norms(Y, squared=True)[np.newaxis, :] 364 365 if X.dtype == np.float32: 366 # To minimize precision issues with float32, we compute the distance 367 # matrix on chunks of X and Y upcast to float64 368 distances = _euclidean_distances_upcast(X, XX, Y, YY) 369 else: 370 # if dtype is already float64, no need to chunk and upcast 371 distances = -2 * safe_sparse_dot(X, Y.T, dense_output=True) 372 distances += XX 373 distances += YY 374 np.maximum(distances, 0, out=distances) 375 376 # Ensure that distances between vectors and themselves are set to 0.0. 377 # This may not be the case due to floating point rounding errors. 378 if X is Y: 379 np.fill_diagonal(distances, 0) 380 381 return distances if squared else np.sqrt(distances, out=distances) 382 383 384def nan_euclidean_distances( 385 X, Y=None, *, squared=False, missing_values=np.nan, copy=True 386): 387 """Calculate the euclidean distances in the presence of missing values. 388 389 Compute the euclidean distance between each pair of samples in X and Y, 390 where Y=X is assumed if Y=None. When calculating the distance between a 391 pair of samples, this formulation ignores feature coordinates with a 392 missing value in either sample and scales up the weight of the remaining 393 coordinates: 394 395 dist(x,y) = sqrt(weight * sq. distance from present coordinates) 396 where, 397 weight = Total # of coordinates / # of present coordinates 398 399 For example, the distance between ``[3, na, na, 6]`` and ``[1, na, 4, 5]`` 400 is: 401 402 .. math:: 403 \\sqrt{\\frac{4}{2}((3-1)^2 + (6-5)^2)} 404 405 If all the coordinates are missing or if there are no common present 406 coordinates then NaN is returned for that pair. 407 408 Read more in the :ref:`User Guide <metrics>`. 409 410 .. versionadded:: 0.22 411 412 Parameters 413 ---------- 414 X : array-like of shape=(n_samples_X, n_features) 415 416 Y : array-like of shape=(n_samples_Y, n_features), default=None 417 418 squared : bool, default=False 419 Return squared Euclidean distances. 420 421 missing_values : np.nan or int, default=np.nan 422 Representation of missing value. 423 424 copy : bool, default=True 425 Make and use a deep copy of X and Y (if Y exists). 426 427 Returns 428 ------- 429 distances : ndarray of shape (n_samples_X, n_samples_Y) 430 431 See Also 432 -------- 433 paired_distances : Distances between pairs of elements of X and Y. 434 435 Examples 436 -------- 437 >>> from sklearn.metrics.pairwise import nan_euclidean_distances 438 >>> nan = float("NaN") 439 >>> X = [[0, 1], [1, nan]] 440 >>> nan_euclidean_distances(X, X) # distance between rows of X 441 array([[0. , 1.41421356], 442 [1.41421356, 0. ]]) 443 444 >>> # get distance to origin 445 >>> nan_euclidean_distances(X, [[0, 0]]) 446 array([[1. ], 447 [1.41421356]]) 448 449 References 450 ---------- 451 * John K. Dixon, "Pattern Recognition with Partly Missing Data", 452 IEEE Transactions on Systems, Man, and Cybernetics, Volume: 9, Issue: 453 10, pp. 617 - 621, Oct. 1979. 454 http://ieeexplore.ieee.org/abstract/document/4310090/ 455 """ 456 457 force_all_finite = "allow-nan" if is_scalar_nan(missing_values) else True 458 X, Y = check_pairwise_arrays( 459 X, Y, accept_sparse=False, force_all_finite=force_all_finite, copy=copy 460 ) 461 # Get missing mask for X 462 missing_X = _get_mask(X, missing_values) 463 464 # Get missing mask for Y 465 missing_Y = missing_X if Y is X else _get_mask(Y, missing_values) 466 467 # set missing values to zero 468 X[missing_X] = 0 469 Y[missing_Y] = 0 470 471 distances = euclidean_distances(X, Y, squared=True) 472 473 # Adjust distances for missing values 474 XX = X * X 475 YY = Y * Y 476 distances -= np.dot(XX, missing_Y.T) 477 distances -= np.dot(missing_X, YY.T) 478 479 np.clip(distances, 0, None, out=distances) 480 481 if X is Y: 482 # Ensure that distances between vectors and themselves are set to 0.0. 483 # This may not be the case due to floating point rounding errors. 484 np.fill_diagonal(distances, 0.0) 485 486 present_X = 1 - missing_X 487 present_Y = present_X if Y is X else ~missing_Y 488 present_count = np.dot(present_X, present_Y.T) 489 distances[present_count == 0] = np.nan 490 # avoid divide by zero 491 np.maximum(1, present_count, out=present_count) 492 distances /= present_count 493 distances *= X.shape[1] 494 495 if not squared: 496 np.sqrt(distances, out=distances) 497 498 return distances 499 500 501def _euclidean_distances_upcast(X, XX=None, Y=None, YY=None, batch_size=None): 502 """Euclidean distances between X and Y. 503 504 Assumes X and Y have float32 dtype. 505 Assumes XX and YY have float64 dtype or are None. 506 507 X and Y are upcast to float64 by chunks, which size is chosen to limit 508 memory increase by approximately 10% (at least 10MiB). 509 """ 510 n_samples_X = X.shape[0] 511 n_samples_Y = Y.shape[0] 512 n_features = X.shape[1] 513 514 distances = np.empty((n_samples_X, n_samples_Y), dtype=np.float32) 515 516 if batch_size is None: 517 x_density = X.nnz / np.prod(X.shape) if issparse(X) else 1 518 y_density = Y.nnz / np.prod(Y.shape) if issparse(Y) else 1 519 520 # Allow 10% more memory than X, Y and the distance matrix take (at 521 # least 10MiB) 522 maxmem = max( 523 ( 524 (x_density * n_samples_X + y_density * n_samples_Y) * n_features 525 + (x_density * n_samples_X * y_density * n_samples_Y) 526 ) 527 / 10, 528 10 * 2 ** 17, 529 ) 530 531 # The increase amount of memory in 8-byte blocks is: 532 # - x_density * batch_size * n_features (copy of chunk of X) 533 # - y_density * batch_size * n_features (copy of chunk of Y) 534 # - batch_size * batch_size (chunk of distance matrix) 535 # Hence x² + (xd+yd)kx = M, where x=batch_size, k=n_features, M=maxmem 536 # xd=x_density and yd=y_density 537 tmp = (x_density + y_density) * n_features 538 batch_size = (-tmp + np.sqrt(tmp ** 2 + 4 * maxmem)) / 2 539 batch_size = max(int(batch_size), 1) 540 541 x_batches = gen_batches(n_samples_X, batch_size) 542 543 for i, x_slice in enumerate(x_batches): 544 X_chunk = X[x_slice].astype(np.float64) 545 if XX is None: 546 XX_chunk = row_norms(X_chunk, squared=True)[:, np.newaxis] 547 else: 548 XX_chunk = XX[x_slice] 549 550 y_batches = gen_batches(n_samples_Y, batch_size) 551 552 for j, y_slice in enumerate(y_batches): 553 if X is Y and j < i: 554 # when X is Y the distance matrix is symmetric so we only need 555 # to compute half of it. 556 d = distances[y_slice, x_slice].T 557 558 else: 559 Y_chunk = Y[y_slice].astype(np.float64) 560 if YY is None: 561 YY_chunk = row_norms(Y_chunk, squared=True)[np.newaxis, :] 562 else: 563 YY_chunk = YY[:, y_slice] 564 565 d = -2 * safe_sparse_dot(X_chunk, Y_chunk.T, dense_output=True) 566 d += XX_chunk 567 d += YY_chunk 568 569 distances[x_slice, y_slice] = d.astype(np.float32, copy=False) 570 571 return distances 572 573 574def _argmin_min_reduce(dist, start): 575 indices = dist.argmin(axis=1) 576 values = dist[np.arange(dist.shape[0]), indices] 577 return indices, values 578 579 580def pairwise_distances_argmin_min( 581 X, Y, *, axis=1, metric="euclidean", metric_kwargs=None 582): 583 """Compute minimum distances between one point and a set of points. 584 585 This function computes for each row in X, the index of the row of Y which 586 is closest (according to the specified distance). The minimal distances are 587 also returned. 588 589 This is mostly equivalent to calling: 590 591 (pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis), 592 pairwise_distances(X, Y=Y, metric=metric).min(axis=axis)) 593 594 but uses much less memory, and is faster for large arrays. 595 596 Parameters 597 ---------- 598 X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 599 Array containing points. 600 601 Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features) 602 Array containing points. 603 604 axis : int, default=1 605 Axis along which the argmin and distances are to be computed. 606 607 metric : str or callable, default='euclidean' 608 Metric to use for distance computation. Any metric from scikit-learn 609 or scipy.spatial.distance can be used. 610 611 If metric is a callable function, it is called on each 612 pair of instances (rows) and the resulting value recorded. The callable 613 should take two arrays as input and return one value indicating the 614 distance between them. This works for Scipy's metrics, but is less 615 efficient than passing the metric name as a string. 616 617 Distance matrices are not supported. 618 619 Valid values for metric are: 620 621 - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 622 'manhattan'] 623 624 - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 625 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 626 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 627 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 628 'yule'] 629 630 See the documentation for scipy.spatial.distance for details on these 631 metrics. 632 633 metric_kwargs : dict, default=None 634 Keyword arguments to pass to specified metric function. 635 636 Returns 637 ------- 638 argmin : ndarray 639 Y[argmin[i], :] is the row in Y that is closest to X[i, :]. 640 641 distances : ndarray 642 distances[i] is the distance between the i-th row in X and the 643 argmin[i]-th row in Y. 644 645 See Also 646 -------- 647 sklearn.metrics.pairwise_distances 648 sklearn.metrics.pairwise_distances_argmin 649 """ 650 X, Y = check_pairwise_arrays(X, Y) 651 652 if metric_kwargs is None: 653 metric_kwargs = {} 654 655 if axis == 0: 656 X, Y = Y, X 657 658 indices, values = zip( 659 *pairwise_distances_chunked( 660 X, Y, reduce_func=_argmin_min_reduce, metric=metric, **metric_kwargs 661 ) 662 ) 663 indices = np.concatenate(indices) 664 values = np.concatenate(values) 665 666 return indices, values 667 668 669def pairwise_distances_argmin(X, Y, *, axis=1, metric="euclidean", metric_kwargs=None): 670 """Compute minimum distances between one point and a set of points. 671 672 This function computes for each row in X, the index of the row of Y which 673 is closest (according to the specified distance). 674 675 This is mostly equivalent to calling: 676 677 pairwise_distances(X, Y=Y, metric=metric).argmin(axis=axis) 678 679 but uses much less memory, and is faster for large arrays. 680 681 This function works with dense 2D arrays only. 682 683 Parameters 684 ---------- 685 X : array-like of shape (n_samples_X, n_features) 686 Array containing points. 687 688 Y : array-like of shape (n_samples_Y, n_features) 689 Arrays containing points. 690 691 axis : int, default=1 692 Axis along which the argmin and distances are to be computed. 693 694 metric : str or callable, default="euclidean" 695 Metric to use for distance computation. Any metric from scikit-learn 696 or scipy.spatial.distance can be used. 697 698 If metric is a callable function, it is called on each 699 pair of instances (rows) and the resulting value recorded. The callable 700 should take two arrays as input and return one value indicating the 701 distance between them. This works for Scipy's metrics, but is less 702 efficient than passing the metric name as a string. 703 704 Distance matrices are not supported. 705 706 Valid values for metric are: 707 708 - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 709 'manhattan'] 710 711 - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 712 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 713 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 714 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 715 'yule'] 716 717 See the documentation for scipy.spatial.distance for details on these 718 metrics. 719 720 metric_kwargs : dict, default=None 721 Keyword arguments to pass to specified metric function. 722 723 Returns 724 ------- 725 argmin : numpy.ndarray 726 Y[argmin[i], :] is the row in Y that is closest to X[i, :]. 727 728 See Also 729 -------- 730 sklearn.metrics.pairwise_distances 731 sklearn.metrics.pairwise_distances_argmin_min 732 """ 733 if metric_kwargs is None: 734 metric_kwargs = {} 735 736 return pairwise_distances_argmin_min( 737 X, Y, axis=axis, metric=metric, metric_kwargs=metric_kwargs 738 )[0] 739 740 741def haversine_distances(X, Y=None): 742 """Compute the Haversine distance between samples in X and Y. 743 744 The Haversine (or great circle) distance is the angular distance between 745 two points on the surface of a sphere. The first coordinate of each point 746 is assumed to be the latitude, the second is the longitude, given 747 in radians. The dimension of the data must be 2. 748 749 .. math:: 750 D(x, y) = 2\\arcsin[\\sqrt{\\sin^2((x1 - y1) / 2) 751 + \\cos(x1)\\cos(y1)\\sin^2((x2 - y2) / 2)}] 752 753 Parameters 754 ---------- 755 X : array-like of shape (n_samples_X, 2) 756 757 Y : array-like of shape (n_samples_Y, 2), default=None 758 759 Returns 760 ------- 761 distance : ndarray of shape (n_samples_X, n_samples_Y) 762 763 Notes 764 ----- 765 As the Earth is nearly spherical, the haversine formula provides a good 766 approximation of the distance between two points of the Earth surface, with 767 a less than 1% error on average. 768 769 Examples 770 -------- 771 We want to calculate the distance between the Ezeiza Airport 772 (Buenos Aires, Argentina) and the Charles de Gaulle Airport (Paris, 773 France). 774 775 >>> from sklearn.metrics.pairwise import haversine_distances 776 >>> from math import radians 777 >>> bsas = [-34.83333, -58.5166646] 778 >>> paris = [49.0083899664, 2.53844117956] 779 >>> bsas_in_radians = [radians(_) for _ in bsas] 780 >>> paris_in_radians = [radians(_) for _ in paris] 781 >>> result = haversine_distances([bsas_in_radians, paris_in_radians]) 782 >>> result * 6371000/1000 # multiply by Earth radius to get kilometers 783 array([[ 0. , 11099.54035582], 784 [11099.54035582, 0. ]]) 785 """ 786 from ..metrics import DistanceMetric 787 788 return DistanceMetric.get_metric("haversine").pairwise(X, Y) 789 790 791def manhattan_distances(X, Y=None, *, sum_over_features=True): 792 """Compute the L1 distances between the vectors in X and Y. 793 794 With sum_over_features equal to False it returns the componentwise 795 distances. 796 797 Read more in the :ref:`User Guide <metrics>`. 798 799 Parameters 800 ---------- 801 X : array-like of shape (n_samples_X, n_features) 802 803 Y : array-like of shape (n_samples_Y, n_features), default=None 804 If `None`, uses `Y=X`. 805 806 sum_over_features : bool, default=True 807 If True the function returns the pairwise distance matrix 808 else it returns the componentwise L1 pairwise-distances. 809 Not supported for sparse matrix inputs. 810 811 Returns 812 ------- 813 D : ndarray of shape (n_samples_X * n_samples_Y, n_features) or \ 814 (n_samples_X, n_samples_Y) 815 If sum_over_features is False shape is 816 (n_samples_X * n_samples_Y, n_features) and D contains the 817 componentwise L1 pairwise-distances (ie. absolute difference), 818 else shape is (n_samples_X, n_samples_Y) and D contains 819 the pairwise L1 distances. 820 821 Notes 822 -------- 823 When X and/or Y are CSR sparse matrices and they are not already 824 in canonical format, this function modifies them in-place to 825 make them canonical. 826 827 Examples 828 -------- 829 >>> from sklearn.metrics.pairwise import manhattan_distances 830 >>> manhattan_distances([[3]], [[3]]) 831 array([[0.]]) 832 >>> manhattan_distances([[3]], [[2]]) 833 array([[1.]]) 834 >>> manhattan_distances([[2]], [[3]]) 835 array([[1.]]) 836 >>> manhattan_distances([[1, 2], [3, 4]],\ 837 [[1, 2], [0, 3]]) 838 array([[0., 2.], 839 [4., 4.]]) 840 >>> import numpy as np 841 >>> X = np.ones((1, 2)) 842 >>> y = np.full((2, 2), 2.) 843 >>> manhattan_distances(X, y, sum_over_features=False) 844 array([[1., 1.], 845 [1., 1.]]) 846 """ 847 X, Y = check_pairwise_arrays(X, Y) 848 849 if issparse(X) or issparse(Y): 850 if not sum_over_features: 851 raise TypeError( 852 "sum_over_features=%r not supported for sparse matrices" 853 % sum_over_features 854 ) 855 856 X = csr_matrix(X, copy=False) 857 Y = csr_matrix(Y, copy=False) 858 X.sum_duplicates() # this also sorts indices in-place 859 Y.sum_duplicates() 860 D = np.zeros((X.shape[0], Y.shape[0])) 861 _sparse_manhattan(X.data, X.indices, X.indptr, Y.data, Y.indices, Y.indptr, D) 862 return D 863 864 if sum_over_features: 865 return distance.cdist(X, Y, "cityblock") 866 867 D = X[:, np.newaxis, :] - Y[np.newaxis, :, :] 868 D = np.abs(D, D) 869 return D.reshape((-1, X.shape[1])) 870 871 872def cosine_distances(X, Y=None): 873 """Compute cosine distance between samples in X and Y. 874 875 Cosine distance is defined as 1.0 minus the cosine similarity. 876 877 Read more in the :ref:`User Guide <metrics>`. 878 879 Parameters 880 ---------- 881 X : {array-like, sparse matrix} of shape (n_samples_X, n_features) 882 Matrix `X`. 883 884 Y : {array-like, sparse matrix} of shape (n_samples_Y, n_features), \ 885 default=None 886 Matrix `Y`. 887 888 Returns 889 ------- 890 distance matrix : ndarray of shape (n_samples_X, n_samples_Y) 891 892 See Also 893 -------- 894 cosine_similarity 895 scipy.spatial.distance.cosine : Dense matrices only. 896 """ 897 # 1.0 - cosine_similarity(X, Y) without copy 898 S = cosine_similarity(X, Y) 899 S *= -1 900 S += 1 901 np.clip(S, 0, 2, out=S) 902 if X is Y or Y is None: 903 # Ensure that distances between vectors and themselves are set to 0.0. 904 # This may not be the case due to floating point rounding errors. 905 S[np.diag_indices_from(S)] = 0.0 906 return S 907 908 909# Paired distances 910def paired_euclidean_distances(X, Y): 911 """ 912 Computes the paired euclidean distances between X and Y. 913 914 Read more in the :ref:`User Guide <metrics>`. 915 916 Parameters 917 ---------- 918 X : array-like of shape (n_samples, n_features) 919 920 Y : array-like of shape (n_samples, n_features) 921 922 Returns 923 ------- 924 distances : ndarray of shape (n_samples,) 925 """ 926 X, Y = check_paired_arrays(X, Y) 927 return row_norms(X - Y) 928 929 930def paired_manhattan_distances(X, Y): 931 """Compute the L1 distances between the vectors in X and Y. 932 933 Read more in the :ref:`User Guide <metrics>`. 934 935 Parameters 936 ---------- 937 X : array-like of shape (n_samples, n_features) 938 939 Y : array-like of shape (n_samples, n_features) 940 941 Returns 942 ------- 943 distances : ndarray of shape (n_samples,) 944 """ 945 X, Y = check_paired_arrays(X, Y) 946 diff = X - Y 947 if issparse(diff): 948 diff.data = np.abs(diff.data) 949 return np.squeeze(np.array(diff.sum(axis=1))) 950 else: 951 return np.abs(diff).sum(axis=-1) 952 953 954def paired_cosine_distances(X, Y): 955 """ 956 Computes the paired cosine distances between X and Y. 957 958 Read more in the :ref:`User Guide <metrics>`. 959 960 Parameters 961 ---------- 962 X : array-like of shape (n_samples, n_features) 963 964 Y : array-like of shape (n_samples, n_features) 965 966 Returns 967 ------- 968 distances : ndarray of shape (n_samples,) 969 970 Notes 971 ----- 972 The cosine distance is equivalent to the half the squared 973 euclidean distance if each sample is normalized to unit norm. 974 """ 975 X, Y = check_paired_arrays(X, Y) 976 return 0.5 * row_norms(normalize(X) - normalize(Y), squared=True) 977 978 979PAIRED_DISTANCES = { 980 "cosine": paired_cosine_distances, 981 "euclidean": paired_euclidean_distances, 982 "l2": paired_euclidean_distances, 983 "l1": paired_manhattan_distances, 984 "manhattan": paired_manhattan_distances, 985 "cityblock": paired_manhattan_distances, 986} 987 988 989def paired_distances(X, Y, *, metric="euclidean", **kwds): 990 """ 991 Computes the paired distances between X and Y. 992 993 Computes the distances between (X[0], Y[0]), (X[1], Y[1]), etc... 994 995 Read more in the :ref:`User Guide <metrics>`. 996 997 Parameters 998 ---------- 999 X : ndarray of shape (n_samples, n_features) 1000 Array 1 for distance computation. 1001 1002 Y : ndarray of shape (n_samples, n_features) 1003 Array 2 for distance computation. 1004 1005 metric : str or callable, default="euclidean" 1006 The metric to use when calculating distance between instances in a 1007 feature array. If metric is a string, it must be one of the options 1008 specified in PAIRED_DISTANCES, including "euclidean", 1009 "manhattan", or "cosine". 1010 Alternatively, if metric is a callable function, it is called on each 1011 pair of instances (rows) and the resulting value recorded. The callable 1012 should take two arrays from X as input and return a value indicating 1013 the distance between them. 1014 1015 Returns 1016 ------- 1017 distances : ndarray of shape (n_samples,) 1018 1019 See Also 1020 -------- 1021 pairwise_distances : Computes the distance between every pair of samples. 1022 1023 Examples 1024 -------- 1025 >>> from sklearn.metrics.pairwise import paired_distances 1026 >>> X = [[0, 1], [1, 1]] 1027 >>> Y = [[0, 1], [2, 1]] 1028 >>> paired_distances(X, Y) 1029 array([0., 1.]) 1030 """ 1031 1032 if metric in PAIRED_DISTANCES: 1033 func = PAIRED_DISTANCES[metric] 1034 return func(X, Y) 1035 elif callable(metric): 1036 # Check the matrix first (it is usually done by the metric) 1037 X, Y = check_paired_arrays(X, Y) 1038 distances = np.zeros(len(X)) 1039 for i in range(len(X)): 1040 distances[i] = metric(X[i], Y[i]) 1041 return distances 1042 else: 1043 raise ValueError("Unknown distance %s" % metric) 1044 1045 1046# Kernels 1047def linear_kernel(X, Y=None, dense_output=True): 1048 """ 1049 Compute the linear kernel between X and Y. 1050 1051 Read more in the :ref:`User Guide <linear_kernel>`. 1052 1053 Parameters 1054 ---------- 1055 X : ndarray of shape (n_samples_X, n_features) 1056 A feature array. 1057 1058 Y : ndarray of shape (n_samples_Y, n_features), default=None 1059 An optional second feature array. If `None`, uses `Y=X`. 1060 1061 dense_output : bool, default=True 1062 Whether to return dense output even when the input is sparse. If 1063 ``False``, the output is sparse if both input arrays are sparse. 1064 1065 .. versionadded:: 0.20 1066 1067 Returns 1068 ------- 1069 Gram matrix : ndarray of shape (n_samples_X, n_samples_Y) 1070 The Gram matrix of the linear kernel, i.e. `X @ Y.T`. 1071 """ 1072 X, Y = check_pairwise_arrays(X, Y) 1073 return safe_sparse_dot(X, Y.T, dense_output=dense_output) 1074 1075 1076def polynomial_kernel(X, Y=None, degree=3, gamma=None, coef0=1): 1077 """ 1078 Compute the polynomial kernel between X and Y:: 1079 1080 K(X, Y) = (gamma <X, Y> + coef0)^degree 1081 1082 Read more in the :ref:`User Guide <polynomial_kernel>`. 1083 1084 Parameters 1085 ---------- 1086 X : ndarray of shape (n_samples_X, n_features) 1087 1088 Y : ndarray of shape (n_samples_Y, n_features), default=None 1089 1090 degree : int, default=3 1091 1092 gamma : float, default=None 1093 If None, defaults to 1.0 / n_features. 1094 1095 coef0 : float, default=1 1096 1097 Returns 1098 ------- 1099 Gram matrix : ndarray of shape (n_samples_X, n_samples_Y) 1100 """ 1101 X, Y = check_pairwise_arrays(X, Y) 1102 if gamma is None: 1103 gamma = 1.0 / X.shape[1] 1104 1105 K = safe_sparse_dot(X, Y.T, dense_output=True) 1106 K *= gamma 1107 K += coef0 1108 K **= degree 1109 return K 1110 1111 1112def sigmoid_kernel(X, Y=None, gamma=None, coef0=1): 1113 """ 1114 Compute the sigmoid kernel between X and Y:: 1115 1116 K(X, Y) = tanh(gamma <X, Y> + coef0) 1117 1118 Read more in the :ref:`User Guide <sigmoid_kernel>`. 1119 1120 Parameters 1121 ---------- 1122 X : ndarray of shape (n_samples_X, n_features) 1123 1124 Y : ndarray of shape (n_samples_Y, n_features), default=None 1125 If `None`, uses `Y=X`. 1126 1127 gamma : float, default=None 1128 If None, defaults to 1.0 / n_features. 1129 1130 coef0 : float, default=1 1131 1132 Returns 1133 ------- 1134 Gram matrix : ndarray of shape (n_samples_X, n_samples_Y) 1135 """ 1136 X, Y = check_pairwise_arrays(X, Y) 1137 if gamma is None: 1138 gamma = 1.0 / X.shape[1] 1139 1140 K = safe_sparse_dot(X, Y.T, dense_output=True) 1141 K *= gamma 1142 K += coef0 1143 np.tanh(K, K) # compute tanh in-place 1144 return K 1145 1146 1147def rbf_kernel(X, Y=None, gamma=None): 1148 """ 1149 Compute the rbf (gaussian) kernel between X and Y:: 1150 1151 K(x, y) = exp(-gamma ||x-y||^2) 1152 1153 for each pair of rows x in X and y in Y. 1154 1155 Read more in the :ref:`User Guide <rbf_kernel>`. 1156 1157 Parameters 1158 ---------- 1159 X : ndarray of shape (n_samples_X, n_features) 1160 1161 Y : ndarray of shape (n_samples_Y, n_features), default=None 1162 If `None`, uses `Y=X`. 1163 1164 gamma : float, default=None 1165 If None, defaults to 1.0 / n_features. 1166 1167 Returns 1168 ------- 1169 kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) 1170 """ 1171 X, Y = check_pairwise_arrays(X, Y) 1172 if gamma is None: 1173 gamma = 1.0 / X.shape[1] 1174 1175 K = euclidean_distances(X, Y, squared=True) 1176 K *= -gamma 1177 np.exp(K, K) # exponentiate K in-place 1178 return K 1179 1180 1181def laplacian_kernel(X, Y=None, gamma=None): 1182 """Compute the laplacian kernel between X and Y. 1183 1184 The laplacian kernel is defined as:: 1185 1186 K(x, y) = exp(-gamma ||x-y||_1) 1187 1188 for each pair of rows x in X and y in Y. 1189 Read more in the :ref:`User Guide <laplacian_kernel>`. 1190 1191 .. versionadded:: 0.17 1192 1193 Parameters 1194 ---------- 1195 X : ndarray of shape (n_samples_X, n_features) 1196 1197 Y : ndarray of shape (n_samples_Y, n_features), default=None 1198 If `None`, uses `Y=X`. 1199 1200 gamma : float, default=None 1201 If None, defaults to 1.0 / n_features. 1202 1203 Returns 1204 ------- 1205 kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) 1206 """ 1207 X, Y = check_pairwise_arrays(X, Y) 1208 if gamma is None: 1209 gamma = 1.0 / X.shape[1] 1210 1211 K = -gamma * manhattan_distances(X, Y) 1212 np.exp(K, K) # exponentiate K in-place 1213 return K 1214 1215 1216def cosine_similarity(X, Y=None, dense_output=True): 1217 """Compute cosine similarity between samples in X and Y. 1218 1219 Cosine similarity, or the cosine kernel, computes similarity as the 1220 normalized dot product of X and Y: 1221 1222 K(X, Y) = <X, Y> / (||X||*||Y||) 1223 1224 On L2-normalized data, this function is equivalent to linear_kernel. 1225 1226 Read more in the :ref:`User Guide <cosine_similarity>`. 1227 1228 Parameters 1229 ---------- 1230 X : {ndarray, sparse matrix} of shape (n_samples_X, n_features) 1231 Input data. 1232 1233 Y : {ndarray, sparse matrix} of shape (n_samples_Y, n_features), \ 1234 default=None 1235 Input data. If ``None``, the output will be the pairwise 1236 similarities between all samples in ``X``. 1237 1238 dense_output : bool, default=True 1239 Whether to return dense output even when the input is sparse. If 1240 ``False``, the output is sparse if both input arrays are sparse. 1241 1242 .. versionadded:: 0.17 1243 parameter ``dense_output`` for dense output. 1244 1245 Returns 1246 ------- 1247 kernel matrix : ndarray of shape (n_samples_X, n_samples_Y) 1248 """ 1249 # to avoid recursive import 1250 1251 X, Y = check_pairwise_arrays(X, Y) 1252 1253 X_normalized = normalize(X, copy=True) 1254 if X is Y: 1255 Y_normalized = X_normalized 1256 else: 1257 Y_normalized = normalize(Y, copy=True) 1258 1259 K = safe_sparse_dot(X_normalized, Y_normalized.T, dense_output=dense_output) 1260 1261 return K 1262 1263 1264def additive_chi2_kernel(X, Y=None): 1265 """Computes the additive chi-squared kernel between observations in X and 1266 Y. 1267 1268 The chi-squared kernel is computed between each pair of rows in X and Y. X 1269 and Y have to be non-negative. This kernel is most commonly applied to 1270 histograms. 1271 1272 The chi-squared kernel is given by:: 1273 1274 k(x, y) = -Sum [(x - y)^2 / (x + y)] 1275 1276 It can be interpreted as a weighted difference per entry. 1277 1278 Read more in the :ref:`User Guide <chi2_kernel>`. 1279 1280 Notes 1281 ----- 1282 As the negative of a distance, this kernel is only conditionally positive 1283 definite. 1284 1285 1286 Parameters 1287 ---------- 1288 X : array-like of shape (n_samples_X, n_features) 1289 1290 Y : ndarray of shape (n_samples_Y, n_features), default=None 1291 If `None`, uses `Y=X`. 1292 1293 Returns 1294 ------- 1295 kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) 1296 1297 See Also 1298 -------- 1299 chi2_kernel : The exponentiated version of the kernel, which is usually 1300 preferable. 1301 sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation 1302 to this kernel. 1303 1304 References 1305 ---------- 1306 * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. 1307 Local features and kernels for classification of texture and object 1308 categories: A comprehensive study 1309 International Journal of Computer Vision 2007 1310 https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf 1311 """ 1312 if issparse(X) or issparse(Y): 1313 raise ValueError("additive_chi2 does not support sparse matrices.") 1314 X, Y = check_pairwise_arrays(X, Y) 1315 if (X < 0).any(): 1316 raise ValueError("X contains negative values.") 1317 if Y is not X and (Y < 0).any(): 1318 raise ValueError("Y contains negative values.") 1319 1320 result = np.zeros((X.shape[0], Y.shape[0]), dtype=X.dtype) 1321 _chi2_kernel_fast(X, Y, result) 1322 return result 1323 1324 1325def chi2_kernel(X, Y=None, gamma=1.0): 1326 """Computes the exponential chi-squared kernel X and Y. 1327 1328 The chi-squared kernel is computed between each pair of rows in X and Y. X 1329 and Y have to be non-negative. This kernel is most commonly applied to 1330 histograms. 1331 1332 The chi-squared kernel is given by:: 1333 1334 k(x, y) = exp(-gamma Sum [(x - y)^2 / (x + y)]) 1335 1336 It can be interpreted as a weighted difference per entry. 1337 1338 Read more in the :ref:`User Guide <chi2_kernel>`. 1339 1340 Parameters 1341 ---------- 1342 X : array-like of shape (n_samples_X, n_features) 1343 1344 Y : ndarray of shape (n_samples_Y, n_features), default=None 1345 1346 gamma : float, default=1. 1347 Scaling parameter of the chi2 kernel. 1348 1349 Returns 1350 ------- 1351 kernel_matrix : ndarray of shape (n_samples_X, n_samples_Y) 1352 1353 See Also 1354 -------- 1355 additive_chi2_kernel : The additive version of this kernel. 1356 sklearn.kernel_approximation.AdditiveChi2Sampler : A Fourier approximation 1357 to the additive version of this kernel. 1358 1359 References 1360 ---------- 1361 * Zhang, J. and Marszalek, M. and Lazebnik, S. and Schmid, C. 1362 Local features and kernels for classification of texture and object 1363 categories: A comprehensive study 1364 International Journal of Computer Vision 2007 1365 https://research.microsoft.com/en-us/um/people/manik/projects/trade-off/papers/ZhangIJCV06.pdf 1366 """ 1367 K = additive_chi2_kernel(X, Y) 1368 K *= gamma 1369 return np.exp(K, K) 1370 1371 1372# Helper functions - distance 1373PAIRWISE_DISTANCE_FUNCTIONS = { 1374 # If updating this dictionary, update the doc in both distance_metrics() 1375 # and also in pairwise_distances()! 1376 "cityblock": manhattan_distances, 1377 "cosine": cosine_distances, 1378 "euclidean": euclidean_distances, 1379 "haversine": haversine_distances, 1380 "l2": euclidean_distances, 1381 "l1": manhattan_distances, 1382 "manhattan": manhattan_distances, 1383 "precomputed": None, # HACK: precomputed is always allowed, never called 1384 "nan_euclidean": nan_euclidean_distances, 1385} 1386 1387 1388def distance_metrics(): 1389 """Valid metrics for pairwise_distances. 1390 1391 This function simply returns the valid pairwise distance metrics. 1392 It exists to allow for a description of the mapping for 1393 each of the valid strings. 1394 1395 The valid distance metrics, and the function they map to, are: 1396 1397 =============== ======================================== 1398 metric Function 1399 =============== ======================================== 1400 'cityblock' metrics.pairwise.manhattan_distances 1401 'cosine' metrics.pairwise.cosine_distances 1402 'euclidean' metrics.pairwise.euclidean_distances 1403 'haversine' metrics.pairwise.haversine_distances 1404 'l1' metrics.pairwise.manhattan_distances 1405 'l2' metrics.pairwise.euclidean_distances 1406 'manhattan' metrics.pairwise.manhattan_distances 1407 'nan_euclidean' metrics.pairwise.nan_euclidean_distances 1408 =============== ======================================== 1409 1410 Read more in the :ref:`User Guide <metrics>`. 1411 1412 """ 1413 return PAIRWISE_DISTANCE_FUNCTIONS 1414 1415 1416def _dist_wrapper(dist_func, dist_matrix, slice_, *args, **kwargs): 1417 """Write in-place to a slice of a distance matrix.""" 1418 dist_matrix[:, slice_] = dist_func(*args, **kwargs) 1419 1420 1421def _parallel_pairwise(X, Y, func, n_jobs, **kwds): 1422 """Break the pairwise matrix in n_jobs even slices 1423 and compute them in parallel.""" 1424 1425 if Y is None: 1426 Y = X 1427 X, Y, dtype = _return_float_dtype(X, Y) 1428 1429 if effective_n_jobs(n_jobs) == 1: 1430 return func(X, Y, **kwds) 1431 1432 # enforce a threading backend to prevent data communication overhead 1433 fd = delayed(_dist_wrapper) 1434 ret = np.empty((X.shape[0], Y.shape[0]), dtype=dtype, order="F") 1435 Parallel(backend="threading", n_jobs=n_jobs)( 1436 fd(func, ret, s, X, Y[s], **kwds) 1437 for s in gen_even_slices(_num_samples(Y), effective_n_jobs(n_jobs)) 1438 ) 1439 1440 if (X is Y or Y is None) and func is euclidean_distances: 1441 # zeroing diagonal for euclidean norm. 1442 # TODO: do it also for other norms. 1443 np.fill_diagonal(ret, 0) 1444 1445 return ret 1446 1447 1448def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): 1449 """Handle the callable case for pairwise_{distances,kernels}.""" 1450 X, Y = check_pairwise_arrays(X, Y, force_all_finite=force_all_finite) 1451 1452 if X is Y: 1453 # Only calculate metric for upper triangle 1454 out = np.zeros((X.shape[0], Y.shape[0]), dtype="float") 1455 iterator = itertools.combinations(range(X.shape[0]), 2) 1456 for i, j in iterator: 1457 out[i, j] = metric(X[i], Y[j], **kwds) 1458 1459 # Make symmetric 1460 # NB: out += out.T will produce incorrect results 1461 out = out + out.T 1462 1463 # Calculate diagonal 1464 # NB: nonzero diagonals are allowed for both metrics and kernels 1465 for i in range(X.shape[0]): 1466 x = X[i] 1467 out[i, i] = metric(x, x, **kwds) 1468 1469 else: 1470 # Calculate all cells 1471 out = np.empty((X.shape[0], Y.shape[0]), dtype="float") 1472 iterator = itertools.product(range(X.shape[0]), range(Y.shape[0])) 1473 for i, j in iterator: 1474 out[i, j] = metric(X[i], Y[j], **kwds) 1475 1476 return out 1477 1478 1479_VALID_METRICS = [ 1480 "euclidean", 1481 "l2", 1482 "l1", 1483 "manhattan", 1484 "cityblock", 1485 "braycurtis", 1486 "canberra", 1487 "chebyshev", 1488 "correlation", 1489 "cosine", 1490 "dice", 1491 "hamming", 1492 "jaccard", 1493 "kulsinski", 1494 "mahalanobis", 1495 "matching", 1496 "minkowski", 1497 "rogerstanimoto", 1498 "russellrao", 1499 "seuclidean", 1500 "sokalmichener", 1501 "sokalsneath", 1502 "sqeuclidean", 1503 "yule", 1504 "wminkowski", 1505 "nan_euclidean", 1506 "haversine", 1507] 1508 1509_NAN_METRICS = ["nan_euclidean"] 1510 1511 1512def _check_chunk_size(reduced, chunk_size): 1513 """Checks chunk is a sequence of expected size or a tuple of same.""" 1514 if reduced is None: 1515 return 1516 is_tuple = isinstance(reduced, tuple) 1517 if not is_tuple: 1518 reduced = (reduced,) 1519 if any(isinstance(r, tuple) or not hasattr(r, "__iter__") for r in reduced): 1520 raise TypeError( 1521 "reduce_func returned %r. Expected sequence(s) of length %d." 1522 % (reduced if is_tuple else reduced[0], chunk_size) 1523 ) 1524 if any(_num_samples(r) != chunk_size for r in reduced): 1525 actual_size = tuple(_num_samples(r) for r in reduced) 1526 raise ValueError( 1527 "reduce_func returned object of length %s. " 1528 "Expected same length as input: %d." 1529 % (actual_size if is_tuple else actual_size[0], chunk_size) 1530 ) 1531 1532 1533def _precompute_metric_params(X, Y, metric=None, **kwds): 1534 """Precompute data-derived metric parameters if not provided.""" 1535 if metric == "seuclidean" and "V" not in kwds: 1536 # There is a bug in scipy < 1.5 that will cause a crash if 1537 # X.dtype != np.double (float64). See PR #15730 1538 dtype = np.float64 if sp_version < parse_version("1.5") else None 1539 if X is Y: 1540 V = np.var(X, axis=0, ddof=1, dtype=dtype) 1541 else: 1542 raise ValueError( 1543 "The 'V' parameter is required for the seuclidean metric " 1544 "when Y is passed." 1545 ) 1546 return {"V": V} 1547 if metric == "mahalanobis" and "VI" not in kwds: 1548 if X is Y: 1549 VI = np.linalg.inv(np.cov(X.T)).T 1550 else: 1551 raise ValueError( 1552 "The 'VI' parameter is required for the mahalanobis metric " 1553 "when Y is passed." 1554 ) 1555 return {"VI": VI} 1556 return {} 1557 1558 1559def pairwise_distances_chunked( 1560 X, 1561 Y=None, 1562 *, 1563 reduce_func=None, 1564 metric="euclidean", 1565 n_jobs=None, 1566 working_memory=None, 1567 **kwds, 1568): 1569 """Generate a distance matrix chunk by chunk with optional reduction. 1570 1571 In cases where not all of a pairwise distance matrix needs to be stored at 1572 once, this is used to calculate pairwise distances in 1573 ``working_memory``-sized chunks. If ``reduce_func`` is given, it is run 1574 on each chunk and its return values are concatenated into lists, arrays 1575 or sparse matrices. 1576 1577 Parameters 1578 ---------- 1579 X : ndarray of shape (n_samples_X, n_samples_X) or \ 1580 (n_samples_X, n_features) 1581 Array of pairwise distances between samples, or a feature array. 1582 The shape the array should be (n_samples_X, n_samples_X) if 1583 metric='precomputed' and (n_samples_X, n_features) otherwise. 1584 1585 Y : ndarray of shape (n_samples_Y, n_features), default=None 1586 An optional second feature array. Only allowed if 1587 metric != "precomputed". 1588 1589 reduce_func : callable, default=None 1590 The function which is applied on each chunk of the distance matrix, 1591 reducing it to needed values. ``reduce_func(D_chunk, start)`` 1592 is called repeatedly, where ``D_chunk`` is a contiguous vertical 1593 slice of the pairwise distance matrix, starting at row ``start``. 1594 It should return one of: None; an array, a list, or a sparse matrix 1595 of length ``D_chunk.shape[0]``; or a tuple of such objects. Returning 1596 None is useful for in-place operations, rather than reductions. 1597 1598 If None, pairwise_distances_chunked returns a generator of vertical 1599 chunks of the distance matrix. 1600 1601 metric : str or callable, default='euclidean' 1602 The metric to use when calculating distance between instances in a 1603 feature array. If metric is a string, it must be one of the options 1604 allowed by scipy.spatial.distance.pdist for its metric parameter, or 1605 a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. 1606 If metric is "precomputed", X is assumed to be a distance matrix. 1607 Alternatively, if metric is a callable function, it is called on each 1608 pair of instances (rows) and the resulting value recorded. The callable 1609 should take two arrays from X as input and return a value indicating 1610 the distance between them. 1611 1612 n_jobs : int, default=None 1613 The number of jobs to use for the computation. This works by breaking 1614 down the pairwise matrix into n_jobs even slices and computing them in 1615 parallel. 1616 1617 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 1618 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 1619 for more details. 1620 1621 working_memory : int, default=None 1622 The sought maximum memory for temporary distance matrix chunks. 1623 When None (default), the value of 1624 ``sklearn.get_config()['working_memory']`` is used. 1625 1626 `**kwds` : optional keyword parameters 1627 Any further parameters are passed directly to the distance function. 1628 If using a scipy.spatial.distance metric, the parameters are still 1629 metric dependent. See the scipy docs for usage examples. 1630 1631 Yields 1632 ------ 1633 D_chunk : {ndarray, sparse matrix} 1634 A contiguous slice of distance matrix, optionally processed by 1635 ``reduce_func``. 1636 1637 Examples 1638 -------- 1639 Without reduce_func: 1640 1641 >>> import numpy as np 1642 >>> from sklearn.metrics import pairwise_distances_chunked 1643 >>> X = np.random.RandomState(0).rand(5, 3) 1644 >>> D_chunk = next(pairwise_distances_chunked(X)) 1645 >>> D_chunk 1646 array([[0. ..., 0.29..., 0.41..., 0.19..., 0.57...], 1647 [0.29..., 0. ..., 0.57..., 0.41..., 0.76...], 1648 [0.41..., 0.57..., 0. ..., 0.44..., 0.90...], 1649 [0.19..., 0.41..., 0.44..., 0. ..., 0.51...], 1650 [0.57..., 0.76..., 0.90..., 0.51..., 0. ...]]) 1651 1652 Retrieve all neighbors and average distance within radius r: 1653 1654 >>> r = .2 1655 >>> def reduce_func(D_chunk, start): 1656 ... neigh = [np.flatnonzero(d < r) for d in D_chunk] 1657 ... avg_dist = (D_chunk * (D_chunk < r)).mean(axis=1) 1658 ... return neigh, avg_dist 1659 >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func) 1660 >>> neigh, avg_dist = next(gen) 1661 >>> neigh 1662 [array([0, 3]), array([1]), array([2]), array([0, 3]), array([4])] 1663 >>> avg_dist 1664 array([0.039..., 0. , 0. , 0.039..., 0. ]) 1665 1666 Where r is defined per sample, we need to make use of ``start``: 1667 1668 >>> r = [.2, .4, .4, .3, .1] 1669 >>> def reduce_func(D_chunk, start): 1670 ... neigh = [np.flatnonzero(d < r[i]) 1671 ... for i, d in enumerate(D_chunk, start)] 1672 ... return neigh 1673 >>> neigh = next(pairwise_distances_chunked(X, reduce_func=reduce_func)) 1674 >>> neigh 1675 [array([0, 3]), array([0, 1]), array([2]), array([0, 3]), array([4])] 1676 1677 Force row-by-row generation by reducing ``working_memory``: 1678 1679 >>> gen = pairwise_distances_chunked(X, reduce_func=reduce_func, 1680 ... working_memory=0) 1681 >>> next(gen) 1682 [array([0, 3])] 1683 >>> next(gen) 1684 [array([0, 1])] 1685 """ 1686 n_samples_X = _num_samples(X) 1687 if metric == "precomputed": 1688 slices = (slice(0, n_samples_X),) 1689 else: 1690 if Y is None: 1691 Y = X 1692 # We get as many rows as possible within our working_memory budget to 1693 # store len(Y) distances in each row of output. 1694 # 1695 # Note: 1696 # - this will get at least 1 row, even if 1 row of distances will 1697 # exceed working_memory. 1698 # - this does not account for any temporary memory usage while 1699 # calculating distances (e.g. difference of vectors in manhattan 1700 # distance. 1701 chunk_n_rows = get_chunk_n_rows( 1702 row_bytes=8 * _num_samples(Y), 1703 max_n_rows=n_samples_X, 1704 working_memory=working_memory, 1705 ) 1706 slices = gen_batches(n_samples_X, chunk_n_rows) 1707 1708 # precompute data-derived metric params 1709 params = _precompute_metric_params(X, Y, metric=metric, **kwds) 1710 kwds.update(**params) 1711 1712 for sl in slices: 1713 if sl.start == 0 and sl.stop == n_samples_X: 1714 X_chunk = X # enable optimised paths for X is Y 1715 else: 1716 X_chunk = X[sl] 1717 D_chunk = pairwise_distances(X_chunk, Y, metric=metric, n_jobs=n_jobs, **kwds) 1718 if (X is Y or Y is None) and PAIRWISE_DISTANCE_FUNCTIONS.get( 1719 metric, None 1720 ) is euclidean_distances: 1721 # zeroing diagonal, taking care of aliases of "euclidean", 1722 # i.e. "l2" 1723 D_chunk.flat[sl.start :: _num_samples(X) + 1] = 0 1724 if reduce_func is not None: 1725 chunk_size = D_chunk.shape[0] 1726 D_chunk = reduce_func(D_chunk, sl.start) 1727 _check_chunk_size(D_chunk, chunk_size) 1728 yield D_chunk 1729 1730 1731def pairwise_distances( 1732 X, Y=None, metric="euclidean", *, n_jobs=None, force_all_finite=True, **kwds 1733): 1734 """Compute the distance matrix from a vector array X and optional Y. 1735 1736 This method takes either a vector array or a distance matrix, and returns 1737 a distance matrix. If the input is a vector array, the distances are 1738 computed. If the input is a distances matrix, it is returned instead. 1739 1740 This method provides a safe way to take a distance matrix as input, while 1741 preserving compatibility with many other algorithms that take a vector 1742 array. 1743 1744 If Y is given (default is None), then the returned matrix is the pairwise 1745 distance between the arrays from both X and Y. 1746 1747 Valid values for metric are: 1748 1749 - From scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 1750 'manhattan']. These metrics support sparse matrix 1751 inputs. 1752 ['nan_euclidean'] but it does not yet support sparse matrices. 1753 1754 - From scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', 1755 'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 1756 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 1757 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'] 1758 See the documentation for scipy.spatial.distance for details on these 1759 metrics. These metrics do not support sparse matrix inputs. 1760 1761 Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are 1762 valid scipy.spatial.distance metrics), the scikit-learn implementation 1763 will be used, which is faster and has support for sparse matrices (except 1764 for 'cityblock'). For a verbose description of the metrics from 1765 scikit-learn, see the __doc__ of the sklearn.pairwise.distance_metrics 1766 function. 1767 1768 Read more in the :ref:`User Guide <metrics>`. 1769 1770 Parameters 1771 ---------- 1772 X : ndarray of shape (n_samples_X, n_samples_X) or \ 1773 (n_samples_X, n_features) 1774 Array of pairwise distances between samples, or a feature array. 1775 The shape of the array should be (n_samples_X, n_samples_X) if 1776 metric == "precomputed" and (n_samples_X, n_features) otherwise. 1777 1778 Y : ndarray of shape (n_samples_Y, n_features), default=None 1779 An optional second feature array. Only allowed if 1780 metric != "precomputed". 1781 1782 metric : str or callable, default='euclidean' 1783 The metric to use when calculating distance between instances in a 1784 feature array. If metric is a string, it must be one of the options 1785 allowed by scipy.spatial.distance.pdist for its metric parameter, or 1786 a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. 1787 If metric is "precomputed", X is assumed to be a distance matrix. 1788 Alternatively, if metric is a callable function, it is called on each 1789 pair of instances (rows) and the resulting value recorded. The callable 1790 should take two arrays from X as input and return a value indicating 1791 the distance between them. 1792 1793 n_jobs : int, default=None 1794 The number of jobs to use for the computation. This works by breaking 1795 down the pairwise matrix into n_jobs even slices and computing them in 1796 parallel. 1797 1798 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 1799 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 1800 for more details. 1801 1802 force_all_finite : bool or 'allow-nan', default=True 1803 Whether to raise an error on np.inf, np.nan, pd.NA in array. Ignored 1804 for a metric listed in ``pairwise.PAIRWISE_DISTANCE_FUNCTIONS``. The 1805 possibilities are: 1806 1807 - True: Force all values of array to be finite. 1808 - False: accepts np.inf, np.nan, pd.NA in array. 1809 - 'allow-nan': accepts only np.nan and pd.NA values in array. Values 1810 cannot be infinite. 1811 1812 .. versionadded:: 0.22 1813 ``force_all_finite`` accepts the string ``'allow-nan'``. 1814 1815 .. versionchanged:: 0.23 1816 Accepts `pd.NA` and converts it into `np.nan`. 1817 1818 **kwds : optional keyword parameters 1819 Any further parameters are passed directly to the distance function. 1820 If using a scipy.spatial.distance metric, the parameters are still 1821 metric dependent. See the scipy docs for usage examples. 1822 1823 Returns 1824 ------- 1825 D : ndarray of shape (n_samples_X, n_samples_X) or \ 1826 (n_samples_X, n_samples_Y) 1827 A distance matrix D such that D_{i, j} is the distance between the 1828 ith and jth vectors of the given matrix X, if Y is None. 1829 If Y is not None, then D_{i, j} is the distance between the ith array 1830 from X and the jth array from Y. 1831 1832 See Also 1833 -------- 1834 pairwise_distances_chunked : Performs the same calculation as this 1835 function, but returns a generator of chunks of the distance matrix, in 1836 order to limit memory usage. 1837 paired_distances : Computes the distances between corresponding elements 1838 of two arrays. 1839 """ 1840 if ( 1841 metric not in _VALID_METRICS 1842 and not callable(metric) 1843 and metric != "precomputed" 1844 ): 1845 raise ValueError( 1846 "Unknown metric %s. Valid metrics are %s, or 'precomputed', or a callable" 1847 % (metric, _VALID_METRICS) 1848 ) 1849 1850 if metric == "precomputed": 1851 X, _ = check_pairwise_arrays( 1852 X, Y, precomputed=True, force_all_finite=force_all_finite 1853 ) 1854 1855 whom = ( 1856 "`pairwise_distances`. Precomputed distance " 1857 " need to have non-negative values." 1858 ) 1859 check_non_negative(X, whom=whom) 1860 return X 1861 elif metric in PAIRWISE_DISTANCE_FUNCTIONS: 1862 func = PAIRWISE_DISTANCE_FUNCTIONS[metric] 1863 elif callable(metric): 1864 func = partial( 1865 _pairwise_callable, metric=metric, force_all_finite=force_all_finite, **kwds 1866 ) 1867 else: 1868 if issparse(X) or issparse(Y): 1869 raise TypeError("scipy distance metrics do not support sparse matrices.") 1870 1871 dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None 1872 1873 if dtype == bool and (X.dtype != bool or (Y is not None and Y.dtype != bool)): 1874 msg = "Data was converted to boolean for metric %s" % metric 1875 warnings.warn(msg, DataConversionWarning) 1876 1877 X, Y = check_pairwise_arrays( 1878 X, Y, dtype=dtype, force_all_finite=force_all_finite 1879 ) 1880 1881 # precompute data-derived metric params 1882 params = _precompute_metric_params(X, Y, metric=metric, **kwds) 1883 kwds.update(**params) 1884 1885 if effective_n_jobs(n_jobs) == 1 and X is Y: 1886 return distance.squareform(distance.pdist(X, metric=metric, **kwds)) 1887 func = partial(distance.cdist, metric=metric, **kwds) 1888 1889 return _parallel_pairwise(X, Y, func, n_jobs, **kwds) 1890 1891 1892# These distances require boolean arrays, when using scipy.spatial.distance 1893PAIRWISE_BOOLEAN_FUNCTIONS = [ 1894 "dice", 1895 "jaccard", 1896 "kulsinski", 1897 "matching", 1898 "rogerstanimoto", 1899 "russellrao", 1900 "sokalmichener", 1901 "sokalsneath", 1902 "yule", 1903] 1904 1905# Helper functions - distance 1906PAIRWISE_KERNEL_FUNCTIONS = { 1907 # If updating this dictionary, update the doc in both distance_metrics() 1908 # and also in pairwise_distances()! 1909 "additive_chi2": additive_chi2_kernel, 1910 "chi2": chi2_kernel, 1911 "linear": linear_kernel, 1912 "polynomial": polynomial_kernel, 1913 "poly": polynomial_kernel, 1914 "rbf": rbf_kernel, 1915 "laplacian": laplacian_kernel, 1916 "sigmoid": sigmoid_kernel, 1917 "cosine": cosine_similarity, 1918} 1919 1920 1921def kernel_metrics(): 1922 """Valid metrics for pairwise_kernels. 1923 1924 This function simply returns the valid pairwise distance metrics. 1925 It exists, however, to allow for a verbose description of the mapping for 1926 each of the valid strings. 1927 1928 The valid distance metrics, and the function they map to, are: 1929 =============== ======================================== 1930 metric Function 1931 =============== ======================================== 1932 'additive_chi2' sklearn.pairwise.additive_chi2_kernel 1933 'chi2' sklearn.pairwise.chi2_kernel 1934 'linear' sklearn.pairwise.linear_kernel 1935 'poly' sklearn.pairwise.polynomial_kernel 1936 'polynomial' sklearn.pairwise.polynomial_kernel 1937 'rbf' sklearn.pairwise.rbf_kernel 1938 'laplacian' sklearn.pairwise.laplacian_kernel 1939 'sigmoid' sklearn.pairwise.sigmoid_kernel 1940 'cosine' sklearn.pairwise.cosine_similarity 1941 =============== ======================================== 1942 1943 Read more in the :ref:`User Guide <metrics>`. 1944 """ 1945 return PAIRWISE_KERNEL_FUNCTIONS 1946 1947 1948KERNEL_PARAMS = { 1949 "additive_chi2": (), 1950 "chi2": frozenset(["gamma"]), 1951 "cosine": (), 1952 "linear": (), 1953 "poly": frozenset(["gamma", "degree", "coef0"]), 1954 "polynomial": frozenset(["gamma", "degree", "coef0"]), 1955 "rbf": frozenset(["gamma"]), 1956 "laplacian": frozenset(["gamma"]), 1957 "sigmoid": frozenset(["gamma", "coef0"]), 1958} 1959 1960 1961def pairwise_kernels( 1962 X, Y=None, metric="linear", *, filter_params=False, n_jobs=None, **kwds 1963): 1964 """Compute the kernel between arrays X and optional array Y. 1965 1966 This method takes either a vector array or a kernel matrix, and returns 1967 a kernel matrix. If the input is a vector array, the kernels are 1968 computed. If the input is a kernel matrix, it is returned instead. 1969 1970 This method provides a safe way to take a kernel matrix as input, while 1971 preserving compatibility with many other algorithms that take a vector 1972 array. 1973 1974 If Y is given (default is None), then the returned matrix is the pairwise 1975 kernel between the arrays from both X and Y. 1976 1977 Valid values for metric are: 1978 ['additive_chi2', 'chi2', 'linear', 'poly', 'polynomial', 'rbf', 1979 'laplacian', 'sigmoid', 'cosine'] 1980 1981 Read more in the :ref:`User Guide <metrics>`. 1982 1983 Parameters 1984 ---------- 1985 X : ndarray of shape (n_samples_X, n_samples_X) or \ 1986 (n_samples_X, n_features) 1987 Array of pairwise kernels between samples, or a feature array. 1988 The shape of the array should be (n_samples_X, n_samples_X) if 1989 metric == "precomputed" and (n_samples_X, n_features) otherwise. 1990 1991 Y : ndarray of shape (n_samples_Y, n_features), default=None 1992 A second feature array only if X has shape (n_samples_X, n_features). 1993 1994 metric : str or callable, default="linear" 1995 The metric to use when calculating kernel between instances in a 1996 feature array. If metric is a string, it must be one of the metrics 1997 in pairwise.PAIRWISE_KERNEL_FUNCTIONS. 1998 If metric is "precomputed", X is assumed to be a kernel matrix. 1999 Alternatively, if metric is a callable function, it is called on each 2000 pair of instances (rows) and the resulting value recorded. The callable 2001 should take two rows from X as input and return the corresponding 2002 kernel value as a single number. This means that callables from 2003 :mod:`sklearn.metrics.pairwise` are not allowed, as they operate on 2004 matrices, not single samples. Use the string identifying the kernel 2005 instead. 2006 2007 filter_params : bool, default=False 2008 Whether to filter invalid parameters or not. 2009 2010 n_jobs : int, default=None 2011 The number of jobs to use for the computation. This works by breaking 2012 down the pairwise matrix into n_jobs even slices and computing them in 2013 parallel. 2014 2015 ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. 2016 ``-1`` means using all processors. See :term:`Glossary <n_jobs>` 2017 for more details. 2018 2019 **kwds : optional keyword parameters 2020 Any further parameters are passed directly to the kernel function. 2021 2022 Returns 2023 ------- 2024 K : ndarray of shape (n_samples_X, n_samples_X) or \ 2025 (n_samples_X, n_samples_Y) 2026 A kernel matrix K such that K_{i, j} is the kernel between the 2027 ith and jth vectors of the given matrix X, if Y is None. 2028 If Y is not None, then K_{i, j} is the kernel between the ith array 2029 from X and the jth array from Y. 2030 2031 Notes 2032 ----- 2033 If metric is 'precomputed', Y is ignored and X is returned. 2034 2035 """ 2036 # import GPKernel locally to prevent circular imports 2037 from ..gaussian_process.kernels import Kernel as GPKernel 2038 2039 if metric == "precomputed": 2040 X, _ = check_pairwise_arrays(X, Y, precomputed=True) 2041 return X 2042 elif isinstance(metric, GPKernel): 2043 func = metric.__call__ 2044 elif metric in PAIRWISE_KERNEL_FUNCTIONS: 2045 if filter_params: 2046 kwds = {k: kwds[k] for k in kwds if k in KERNEL_PARAMS[metric]} 2047 func = PAIRWISE_KERNEL_FUNCTIONS[metric] 2048 elif callable(metric): 2049 func = partial(_pairwise_callable, metric=metric, **kwds) 2050 else: 2051 raise ValueError("Unknown kernel %r" % metric) 2052 2053 return _parallel_pairwise(X, Y, func, n_jobs, **kwds) 2054