1from types import GeneratorType 2 3import numpy as np 4from numpy import linalg 5 6from scipy.sparse import dok_matrix, csr_matrix, issparse 7from scipy.spatial.distance import cosine, cityblock, minkowski 8from scipy.spatial.distance import cdist, pdist, squareform 9 10try: 11 from scipy.spatial.distance import wminkowski 12except ImportError: 13 # In scipy 1.6.0, wminkowski is deprecated and minkowski 14 # should be used instead. 15 from scipy.spatial.distance import minkowski as wminkowski 16 17from sklearn.utils.fixes import sp_version, parse_version 18 19import pytest 20 21from sklearn import config_context 22 23from sklearn.utils._testing import assert_array_almost_equal 24from sklearn.utils._testing import assert_allclose 25from sklearn.utils._testing import assert_almost_equal 26from sklearn.utils._testing import assert_array_equal 27from sklearn.utils._testing import ignore_warnings 28 29from sklearn.metrics.pairwise import euclidean_distances 30from sklearn.metrics.pairwise import nan_euclidean_distances 31from sklearn.metrics.pairwise import manhattan_distances 32from sklearn.metrics.pairwise import haversine_distances 33from sklearn.metrics.pairwise import linear_kernel 34from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel 35from sklearn.metrics.pairwise import polynomial_kernel 36from sklearn.metrics.pairwise import rbf_kernel 37from sklearn.metrics.pairwise import laplacian_kernel 38from sklearn.metrics.pairwise import sigmoid_kernel 39from sklearn.metrics.pairwise import cosine_similarity 40from sklearn.metrics.pairwise import cosine_distances 41from sklearn.metrics.pairwise import pairwise_distances 42from sklearn.metrics.pairwise import pairwise_distances_chunked 43from sklearn.metrics.pairwise import pairwise_distances_argmin_min 44from sklearn.metrics.pairwise import pairwise_distances_argmin 45from sklearn.metrics.pairwise import pairwise_kernels 46from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS 47from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS 48from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS 49from sklearn.metrics.pairwise import PAIRED_DISTANCES 50from sklearn.metrics.pairwise import check_pairwise_arrays 51from sklearn.metrics.pairwise import check_paired_arrays 52from sklearn.metrics.pairwise import paired_distances 53from sklearn.metrics.pairwise import paired_euclidean_distances 54from sklearn.metrics.pairwise import paired_manhattan_distances 55from sklearn.metrics.pairwise import _euclidean_distances_upcast 56from sklearn.preprocessing import normalize 57from sklearn.exceptions import DataConversionWarning 58 59 60def test_pairwise_distances(): 61 # Test the pairwise_distance helper function. 62 rng = np.random.RandomState(0) 63 64 # Euclidean distance should be equivalent to calling the function. 65 X = rng.random_sample((5, 4)) 66 S = pairwise_distances(X, metric="euclidean") 67 S2 = euclidean_distances(X) 68 assert_array_almost_equal(S, S2) 69 70 # Euclidean distance, with Y != X. 71 Y = rng.random_sample((2, 4)) 72 S = pairwise_distances(X, Y, metric="euclidean") 73 S2 = euclidean_distances(X, Y) 74 assert_array_almost_equal(S, S2) 75 # Check to ensure NaNs work with pairwise_distances. 76 X_masked = rng.random_sample((5, 4)) 77 Y_masked = rng.random_sample((2, 4)) 78 X_masked[0, 0] = np.nan 79 Y_masked[0, 0] = np.nan 80 S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean") 81 S2_masked = nan_euclidean_distances(X_masked, Y_masked) 82 assert_array_almost_equal(S_masked, S2_masked) 83 # Test with tuples as X and Y 84 X_tuples = tuple([tuple([v for v in row]) for row in X]) 85 Y_tuples = tuple([tuple([v for v in row]) for row in Y]) 86 S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean") 87 assert_array_almost_equal(S, S2) 88 89 # Test haversine distance 90 # The data should be valid latitude and longitude 91 X = rng.random_sample((5, 2)) 92 X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2 93 X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi 94 S = pairwise_distances(X, metric="haversine") 95 S2 = haversine_distances(X) 96 assert_array_almost_equal(S, S2) 97 98 # Test haversine distance, with Y != X 99 Y = rng.random_sample((2, 2)) 100 Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2 101 Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi 102 S = pairwise_distances(X, Y, metric="haversine") 103 S2 = haversine_distances(X, Y) 104 assert_array_almost_equal(S, S2) 105 106 # "cityblock" uses scikit-learn metric, cityblock (function) is 107 # scipy.spatial. 108 S = pairwise_distances(X, metric="cityblock") 109 S2 = pairwise_distances(X, metric=cityblock) 110 assert S.shape[0] == S.shape[1] 111 assert S.shape[0] == X.shape[0] 112 assert_array_almost_equal(S, S2) 113 114 # The manhattan metric should be equivalent to cityblock. 115 S = pairwise_distances(X, Y, metric="manhattan") 116 S2 = pairwise_distances(X, Y, metric=cityblock) 117 assert S.shape[0] == X.shape[0] 118 assert S.shape[1] == Y.shape[0] 119 assert_array_almost_equal(S, S2) 120 121 # Test cosine as a string metric versus cosine callable 122 # The string "cosine" uses sklearn.metric, 123 # while the function cosine is scipy.spatial 124 S = pairwise_distances(X, Y, metric="cosine") 125 S2 = pairwise_distances(X, Y, metric=cosine) 126 assert S.shape[0] == X.shape[0] 127 assert S.shape[1] == Y.shape[0] 128 assert_array_almost_equal(S, S2) 129 130 # Test with sparse X and Y, 131 # currently only supported for Euclidean, L1 and cosine. 132 X_sparse = csr_matrix(X) 133 Y_sparse = csr_matrix(Y) 134 S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean") 135 S2 = euclidean_distances(X_sparse, Y_sparse) 136 assert_array_almost_equal(S, S2) 137 S = pairwise_distances(X_sparse, Y_sparse, metric="cosine") 138 S2 = cosine_distances(X_sparse, Y_sparse) 139 assert_array_almost_equal(S, S2) 140 S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan") 141 S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo()) 142 assert_array_almost_equal(S, S2) 143 S2 = manhattan_distances(X, Y) 144 assert_array_almost_equal(S, S2) 145 146 # Test with scipy.spatial.distance metric, with a kwd 147 kwds = {"p": 2.0} 148 S = pairwise_distances(X, Y, metric="minkowski", **kwds) 149 S2 = pairwise_distances(X, Y, metric=minkowski, **kwds) 150 assert_array_almost_equal(S, S2) 151 152 # same with Y = None 153 kwds = {"p": 2.0} 154 S = pairwise_distances(X, metric="minkowski", **kwds) 155 S2 = pairwise_distances(X, metric=minkowski, **kwds) 156 assert_array_almost_equal(S, S2) 157 158 # Test that scipy distance metrics throw an error if sparse matrix given 159 with pytest.raises(TypeError): 160 pairwise_distances(X_sparse, metric="minkowski") 161 with pytest.raises(TypeError): 162 pairwise_distances(X, Y_sparse, metric="minkowski") 163 164 # Test that a value error is raised if the metric is unknown 165 with pytest.raises(ValueError): 166 pairwise_distances(X, Y, metric="blah") 167 168 169@pytest.mark.parametrize("metric", PAIRWISE_BOOLEAN_FUNCTIONS) 170def test_pairwise_boolean_distance(metric): 171 # test that we convert to boolean arrays for boolean distances 172 rng = np.random.RandomState(0) 173 X = rng.randn(5, 4) 174 Y = X.copy() 175 Y[0, 0] = 1 - Y[0, 0] 176 177 # ignore conversion to boolean in pairwise_distances 178 with ignore_warnings(category=DataConversionWarning): 179 for Z in [Y, None]: 180 res = pairwise_distances(X, Z, metric=metric) 181 res[np.isnan(res)] = 0 182 assert np.sum(res != 0) == 0 183 184 # non-boolean arrays are converted to boolean for boolean 185 # distance metrics with a data conversion warning 186 msg = "Data was converted to boolean for metric %s" % metric 187 with pytest.warns(DataConversionWarning, match=msg): 188 pairwise_distances(X, metric=metric) 189 190 # Check that the warning is raised if X is boolean by Y is not boolean: 191 with pytest.warns(DataConversionWarning, match=msg): 192 pairwise_distances(X.astype(bool), Y=Y, metric=metric) 193 194 # Check that no warning is raised if X is already boolean and Y is None: 195 with pytest.warns(None) as records: 196 pairwise_distances(X.astype(bool), metric=metric) 197 assert len(records) == 0 198 199 200def test_no_data_conversion_warning(): 201 # No warnings issued if metric is not a boolean distance function 202 rng = np.random.RandomState(0) 203 X = rng.randn(5, 4) 204 with pytest.warns(None) as records: 205 pairwise_distances(X, metric="minkowski") 206 assert len(records) == 0 207 208 209@pytest.mark.parametrize("func", [pairwise_distances, pairwise_kernels]) 210def test_pairwise_precomputed(func): 211 # Test correct shape 212 with pytest.raises(ValueError, match=".* shape .*"): 213 func(np.zeros((5, 3)), metric="precomputed") 214 # with two args 215 with pytest.raises(ValueError, match=".* shape .*"): 216 func(np.zeros((5, 3)), np.zeros((4, 4)), metric="precomputed") 217 # even if shape[1] agrees (although thus second arg is spurious) 218 with pytest.raises(ValueError, match=".* shape .*"): 219 func(np.zeros((5, 3)), np.zeros((4, 3)), metric="precomputed") 220 221 # Test not copied (if appropriate dtype) 222 S = np.zeros((5, 5)) 223 S2 = func(S, metric="precomputed") 224 assert S is S2 225 # with two args 226 S = np.zeros((5, 3)) 227 S2 = func(S, np.zeros((3, 3)), metric="precomputed") 228 assert S is S2 229 230 # Test always returns float dtype 231 S = func(np.array([[1]], dtype="int"), metric="precomputed") 232 assert "f" == S.dtype.kind 233 234 # Test converts list to array-like 235 S = func([[1.0]], metric="precomputed") 236 assert isinstance(S, np.ndarray) 237 238 239def test_pairwise_precomputed_non_negative(): 240 # Test non-negative values 241 with pytest.raises(ValueError, match=".* non-negative values.*"): 242 pairwise_distances(np.full((5, 5), -1), metric="precomputed") 243 244 245_minkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} 246_wminkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1} 247 248 249def callable_rbf_kernel(x, y, **kwds): 250 # Callable version of pairwise.rbf_kernel. 251 K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds) 252 return K 253 254 255@pytest.mark.parametrize( 256 "func, metric, kwds", 257 [ 258 (pairwise_distances, "euclidean", {}), 259 pytest.param( 260 pairwise_distances, 261 minkowski, 262 _minkowski_kwds, 263 marks=pytest.mark.skipif( 264 sp_version < parse_version("1.0"), 265 reason="minkowski does not accept the w parameter prior to scipy 1.0.", 266 ), 267 ), 268 pytest.param( 269 pairwise_distances, 270 "minkowski", 271 _minkowski_kwds, 272 marks=pytest.mark.skipif( 273 sp_version < parse_version("1.0"), 274 reason="minkowski does not accept the w parameter prior to scipy 1.0.", 275 ), 276 ), 277 pytest.param( 278 pairwise_distances, 279 wminkowski, 280 _wminkowski_kwds, 281 marks=pytest.mark.skipif( 282 sp_version >= parse_version("1.6.0"), 283 reason="wminkowski is now minkowski and it has been already tested.", 284 ), 285 ), 286 pytest.param( 287 pairwise_distances, 288 "wminkowski", 289 _wminkowski_kwds, 290 marks=pytest.mark.skipif( 291 sp_version >= parse_version("1.6.0"), 292 reason="wminkowski is now minkowski and it has been already tested.", 293 ), 294 ), 295 (pairwise_kernels, "polynomial", {"degree": 1}), 296 (pairwise_kernels, callable_rbf_kernel, {"gamma": 0.1}), 297 ], 298) 299@pytest.mark.parametrize("dtype", [np.float64, int]) 300def test_pairwise_parallel(func, metric, kwds, dtype): 301 rng = np.random.RandomState(0) 302 X = np.array(5 * rng.random_sample((5, 4)), dtype=dtype) 303 Y = np.array(5 * rng.random_sample((3, 4)), dtype=dtype) 304 305 S = func(X, metric=metric, n_jobs=1, **kwds) 306 S2 = func(X, metric=metric, n_jobs=2, **kwds) 307 assert_allclose(S, S2) 308 309 S = func(X, Y, metric=metric, n_jobs=1, **kwds) 310 S2 = func(X, Y, metric=metric, n_jobs=2, **kwds) 311 assert_allclose(S, S2) 312 313 314def test_pairwise_callable_nonstrict_metric(): 315 # paired_distances should allow callable metric where metric(x, x) != 0 316 # Knowing that the callable is a strict metric would allow the diagonal to 317 # be left uncalculated and set to 0. 318 assert pairwise_distances([[1.0]], metric=lambda x, y: 5)[0, 0] == 5 319 320 321# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS. 322@pytest.mark.parametrize( 323 "metric", 324 ["rbf", "laplacian", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"], 325) 326def test_pairwise_kernels(metric): 327 # Test the pairwise_kernels helper function. 328 329 rng = np.random.RandomState(0) 330 X = rng.random_sample((5, 4)) 331 Y = rng.random_sample((2, 4)) 332 function = PAIRWISE_KERNEL_FUNCTIONS[metric] 333 # Test with Y=None 334 K1 = pairwise_kernels(X, metric=metric) 335 K2 = function(X) 336 assert_array_almost_equal(K1, K2) 337 # Test with Y=Y 338 K1 = pairwise_kernels(X, Y=Y, metric=metric) 339 K2 = function(X, Y=Y) 340 assert_array_almost_equal(K1, K2) 341 # Test with tuples as X and Y 342 X_tuples = tuple([tuple([v for v in row]) for row in X]) 343 Y_tuples = tuple([tuple([v for v in row]) for row in Y]) 344 K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric) 345 assert_array_almost_equal(K1, K2) 346 347 # Test with sparse X and Y 348 X_sparse = csr_matrix(X) 349 Y_sparse = csr_matrix(Y) 350 if metric in ["chi2", "additive_chi2"]: 351 # these don't support sparse matrices yet 352 with pytest.raises(ValueError): 353 pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) 354 return 355 K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric) 356 assert_array_almost_equal(K1, K2) 357 358 359def test_pairwise_kernels_callable(): 360 # Test the pairwise_kernels helper function 361 # with a callable function, with given keywords. 362 rng = np.random.RandomState(0) 363 X = rng.random_sample((5, 4)) 364 Y = rng.random_sample((2, 4)) 365 366 metric = callable_rbf_kernel 367 kwds = {"gamma": 0.1} 368 K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds) 369 K2 = rbf_kernel(X, Y=Y, **kwds) 370 assert_array_almost_equal(K1, K2) 371 372 # callable function, X=Y 373 K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds) 374 K2 = rbf_kernel(X, Y=X, **kwds) 375 assert_array_almost_equal(K1, K2) 376 377 378def test_pairwise_kernels_filter_param(): 379 rng = np.random.RandomState(0) 380 X = rng.random_sample((5, 4)) 381 Y = rng.random_sample((2, 4)) 382 K = rbf_kernel(X, Y, gamma=0.1) 383 params = {"gamma": 0.1, "blabla": ":)"} 384 K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params) 385 assert_array_almost_equal(K, K2) 386 387 with pytest.raises(TypeError): 388 pairwise_kernels(X, Y, metric="rbf", **params) 389 390 391@pytest.mark.parametrize("metric, func", PAIRED_DISTANCES.items()) 392def test_paired_distances(metric, func): 393 # Test the pairwise_distance helper function. 394 rng = np.random.RandomState(0) 395 # Euclidean distance should be equivalent to calling the function. 396 X = rng.random_sample((5, 4)) 397 # Euclidean distance, with Y != X. 398 Y = rng.random_sample((5, 4)) 399 400 S = paired_distances(X, Y, metric=metric) 401 S2 = func(X, Y) 402 assert_array_almost_equal(S, S2) 403 S3 = func(csr_matrix(X), csr_matrix(Y)) 404 assert_array_almost_equal(S, S3) 405 if metric in PAIRWISE_DISTANCE_FUNCTIONS: 406 # Check the pairwise_distances implementation 407 # gives the same value 408 distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y) 409 distances = np.diag(distances) 410 assert_array_almost_equal(distances, S) 411 412 413def test_paired_distances_callable(): 414 # Test the pairwise_distance helper function 415 # with the callable implementation 416 rng = np.random.RandomState(0) 417 # Euclidean distance should be equivalent to calling the function. 418 X = rng.random_sample((5, 4)) 419 # Euclidean distance, with Y != X. 420 Y = rng.random_sample((5, 4)) 421 422 S = paired_distances(X, Y, metric="manhattan") 423 S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0)) 424 assert_array_almost_equal(S, S2) 425 426 # Test that a value error is raised when the lengths of X and Y should not 427 # differ 428 Y = rng.random_sample((3, 4)) 429 with pytest.raises(ValueError): 430 paired_distances(X, Y) 431 432 433def test_pairwise_distances_argmin_min(): 434 # Check pairwise minimum distances computation for any metric 435 X = [[0], [1]] 436 Y = [[-2], [3]] 437 438 Xsp = dok_matrix(X) 439 Ysp = csr_matrix(Y, dtype=np.float32) 440 441 expected_idx = [0, 1] 442 expected_vals = [2, 2] 443 expected_vals_sq = [4, 4] 444 445 # euclidean metric 446 idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean") 447 idx2 = pairwise_distances_argmin(X, Y, metric="euclidean") 448 assert_array_almost_equal(idx, expected_idx) 449 assert_array_almost_equal(idx2, expected_idx) 450 assert_array_almost_equal(vals, expected_vals) 451 # sparse matrix case 452 idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean") 453 assert_array_almost_equal(idxsp, expected_idx) 454 assert_array_almost_equal(valssp, expected_vals) 455 # We don't want np.matrix here 456 assert type(idxsp) == np.ndarray 457 assert type(valssp) == np.ndarray 458 459 # euclidean metric squared 460 idx, vals = pairwise_distances_argmin_min( 461 X, Y, metric="euclidean", metric_kwargs={"squared": True} 462 ) 463 assert_array_almost_equal(idx, expected_idx) 464 assert_array_almost_equal(vals, expected_vals_sq) 465 466 # Non-euclidean scikit-learn metric 467 idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan") 468 idx2 = pairwise_distances_argmin(X, Y, metric="manhattan") 469 assert_array_almost_equal(idx, expected_idx) 470 assert_array_almost_equal(idx2, expected_idx) 471 assert_array_almost_equal(vals, expected_vals) 472 # sparse matrix case 473 idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan") 474 assert_array_almost_equal(idxsp, expected_idx) 475 assert_array_almost_equal(valssp, expected_vals) 476 477 # Non-euclidean Scipy distance (callable) 478 idx, vals = pairwise_distances_argmin_min( 479 X, Y, metric=minkowski, metric_kwargs={"p": 2} 480 ) 481 assert_array_almost_equal(idx, expected_idx) 482 assert_array_almost_equal(vals, expected_vals) 483 484 # Non-euclidean Scipy distance (string) 485 idx, vals = pairwise_distances_argmin_min( 486 X, Y, metric="minkowski", metric_kwargs={"p": 2} 487 ) 488 assert_array_almost_equal(idx, expected_idx) 489 assert_array_almost_equal(vals, expected_vals) 490 491 # Compare with naive implementation 492 rng = np.random.RandomState(0) 493 X = rng.randn(97, 149) 494 Y = rng.randn(111, 149) 495 496 dist = pairwise_distances(X, Y, metric="manhattan") 497 dist_orig_ind = dist.argmin(axis=0) 498 dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))] 499 500 dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min( 501 X, Y, axis=0, metric="manhattan" 502 ) 503 np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7) 504 np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7) 505 506 507def _reduce_func(dist, start): 508 return dist[:, :100] 509 510 511def test_pairwise_distances_chunked_reduce(): 512 rng = np.random.RandomState(0) 513 X = rng.random_sample((400, 4)) 514 # Reduced Euclidean distance 515 S = pairwise_distances(X)[:, :100] 516 S_chunks = pairwise_distances_chunked( 517 X, None, reduce_func=_reduce_func, working_memory=2 ** -16 518 ) 519 assert isinstance(S_chunks, GeneratorType) 520 S_chunks = list(S_chunks) 521 assert len(S_chunks) > 1 522 # atol is for diagonal where S is explicitly zeroed on the diagonal 523 assert_allclose(np.vstack(S_chunks), S, atol=1e-7) 524 525 526def test_pairwise_distances_chunked_reduce_none(): 527 # check that the reduce func is allowed to return None 528 rng = np.random.RandomState(0) 529 X = rng.random_sample((10, 4)) 530 S_chunks = pairwise_distances_chunked( 531 X, None, reduce_func=lambda dist, start: None, working_memory=2 ** -16 532 ) 533 assert isinstance(S_chunks, GeneratorType) 534 S_chunks = list(S_chunks) 535 assert len(S_chunks) > 1 536 assert all(chunk is None for chunk in S_chunks) 537 538 539@pytest.mark.parametrize( 540 "good_reduce", 541 [ 542 lambda D, start: list(D), 543 lambda D, start: np.array(D), 544 lambda D, start: csr_matrix(D), 545 lambda D, start: (list(D), list(D)), 546 lambda D, start: (dok_matrix(D), np.array(D), list(D)), 547 ], 548) 549def test_pairwise_distances_chunked_reduce_valid(good_reduce): 550 X = np.arange(10).reshape(-1, 1) 551 S_chunks = pairwise_distances_chunked( 552 X, None, reduce_func=good_reduce, working_memory=64 553 ) 554 next(S_chunks) 555 556 557@pytest.mark.parametrize( 558 ("bad_reduce", "err_type", "message"), 559 [ 560 ( 561 lambda D, s: np.concatenate([D, D[-1:]]), 562 ValueError, 563 r"length 11\..* input: 10\.", 564 ), 565 ( 566 lambda D, s: (D, np.concatenate([D, D[-1:]])), 567 ValueError, 568 r"length \(10, 11\)\..* input: 10\.", 569 ), 570 (lambda D, s: (D[:9], D), ValueError, r"length \(9, 10\)\..* input: 10\."), 571 ( 572 lambda D, s: 7, 573 TypeError, 574 r"returned 7\. Expected sequence\(s\) of length 10\.", 575 ), 576 ( 577 lambda D, s: (7, 8), 578 TypeError, 579 r"returned \(7, 8\)\. Expected sequence\(s\) of length 10\.", 580 ), 581 ( 582 lambda D, s: (np.arange(10), 9), 583 TypeError, 584 r", 9\)\. Expected sequence\(s\) of length 10\.", 585 ), 586 ], 587) 588def test_pairwise_distances_chunked_reduce_invalid(bad_reduce, err_type, message): 589 X = np.arange(10).reshape(-1, 1) 590 S_chunks = pairwise_distances_chunked( 591 X, None, reduce_func=bad_reduce, working_memory=64 592 ) 593 with pytest.raises(err_type, match=message): 594 next(S_chunks) 595 596 597def check_pairwise_distances_chunked(X, Y, working_memory, metric="euclidean"): 598 gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric) 599 assert isinstance(gen, GeneratorType) 600 blockwise_distances = list(gen) 601 Y = X if Y is None else Y 602 min_block_mib = len(Y) * 8 * 2 ** -20 603 604 for block in blockwise_distances: 605 memory_used = block.nbytes 606 assert memory_used <= max(working_memory, min_block_mib) * 2 ** 20 607 608 blockwise_distances = np.vstack(blockwise_distances) 609 S = pairwise_distances(X, Y, metric=metric) 610 assert_array_almost_equal(blockwise_distances, S) 611 612 613@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) 614def test_pairwise_distances_chunked_diagonal(metric): 615 rng = np.random.RandomState(0) 616 X = rng.normal(size=(1000, 10), scale=1e10) 617 chunks = list(pairwise_distances_chunked(X, working_memory=1, metric=metric)) 618 assert len(chunks) > 1 619 assert_array_almost_equal(np.diag(np.vstack(chunks)), 0, decimal=10) 620 621 622@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean")) 623def test_parallel_pairwise_distances_diagonal(metric): 624 rng = np.random.RandomState(0) 625 X = rng.normal(size=(1000, 10), scale=1e10) 626 distances = pairwise_distances(X, metric=metric, n_jobs=2) 627 assert_allclose(np.diag(distances), 0, atol=1e-10) 628 629 630@ignore_warnings 631def test_pairwise_distances_chunked(): 632 # Test the pairwise_distance helper function. 633 rng = np.random.RandomState(0) 634 # Euclidean distance should be equivalent to calling the function. 635 X = rng.random_sample((200, 4)) 636 check_pairwise_distances_chunked(X, None, working_memory=1, metric="euclidean") 637 # Test small amounts of memory 638 for power in range(-16, 0): 639 check_pairwise_distances_chunked( 640 X, None, working_memory=2 ** power, metric="euclidean" 641 ) 642 # X as list 643 check_pairwise_distances_chunked( 644 X.tolist(), None, working_memory=1, metric="euclidean" 645 ) 646 # Euclidean distance, with Y != X. 647 Y = rng.random_sample((100, 4)) 648 check_pairwise_distances_chunked(X, Y, working_memory=1, metric="euclidean") 649 check_pairwise_distances_chunked( 650 X.tolist(), Y.tolist(), working_memory=1, metric="euclidean" 651 ) 652 # absurdly large working_memory 653 check_pairwise_distances_chunked(X, Y, working_memory=10000, metric="euclidean") 654 # "cityblock" uses scikit-learn metric, cityblock (function) is 655 # scipy.spatial. 656 check_pairwise_distances_chunked(X, Y, working_memory=1, metric="cityblock") 657 # Test that a value error is raised if the metric is unknown 658 with pytest.raises(ValueError): 659 next(pairwise_distances_chunked(X, Y, metric="blah")) 660 661 # Test precomputed returns all at once 662 D = pairwise_distances(X) 663 gen = pairwise_distances_chunked(D, working_memory=2 ** -16, metric="precomputed") 664 assert isinstance(gen, GeneratorType) 665 assert next(gen) is D 666 with pytest.raises(StopIteration): 667 next(gen) 668 669 670@pytest.mark.parametrize( 671 "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 672) 673@pytest.mark.parametrize( 674 "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 675) 676def test_euclidean_distances_known_result(x_array_constr, y_array_constr): 677 # Check the pairwise Euclidean distances computation on known result 678 X = x_array_constr([[0]]) 679 Y = y_array_constr([[1], [2]]) 680 D = euclidean_distances(X, Y) 681 assert_allclose(D, [[1.0, 2.0]]) 682 683 684@pytest.mark.parametrize("dtype", [np.float32, np.float64]) 685@pytest.mark.parametrize( 686 "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 687) 688def test_euclidean_distances_with_norms(dtype, y_array_constr): 689 # check that we still get the right answers with {X,Y}_norm_squared 690 # and that we get a wrong answer with wrong {X,Y}_norm_squared 691 rng = np.random.RandomState(0) 692 X = rng.random_sample((10, 10)).astype(dtype, copy=False) 693 Y = rng.random_sample((20, 10)).astype(dtype, copy=False) 694 695 # norms will only be used if their dtype is float64 696 X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) 697 Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1) 698 699 Y = y_array_constr(Y) 700 701 D1 = euclidean_distances(X, Y) 702 D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq) 703 D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq) 704 D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq) 705 assert_allclose(D2, D1) 706 assert_allclose(D3, D1) 707 assert_allclose(D4, D1) 708 709 # check we get the wrong answer with wrong {X,Y}_norm_squared 710 wrong_D = euclidean_distances( 711 X, 712 Y, 713 X_norm_squared=np.zeros_like(X_norm_sq), 714 Y_norm_squared=np.zeros_like(Y_norm_sq), 715 ) 716 with pytest.raises(AssertionError): 717 assert_allclose(wrong_D, D1) 718 719 720def test_euclidean_distances_norm_shapes(): 721 # Check all accepted shapes for the norms or appropriate error messages. 722 rng = np.random.RandomState(0) 723 X = rng.random_sample((10, 10)) 724 Y = rng.random_sample((20, 10)) 725 726 X_norm_squared = (X ** 2).sum(axis=1) 727 Y_norm_squared = (Y ** 2).sum(axis=1) 728 729 D1 = euclidean_distances( 730 X, Y, X_norm_squared=X_norm_squared, Y_norm_squared=Y_norm_squared 731 ) 732 D2 = euclidean_distances( 733 X, 734 Y, 735 X_norm_squared=X_norm_squared.reshape(-1, 1), 736 Y_norm_squared=Y_norm_squared.reshape(-1, 1), 737 ) 738 D3 = euclidean_distances( 739 X, 740 Y, 741 X_norm_squared=X_norm_squared.reshape(1, -1), 742 Y_norm_squared=Y_norm_squared.reshape(1, -1), 743 ) 744 745 assert_allclose(D2, D1) 746 assert_allclose(D3, D1) 747 748 with pytest.raises(ValueError, match="Incompatible dimensions for X"): 749 euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5]) 750 with pytest.raises(ValueError, match="Incompatible dimensions for Y"): 751 euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5]) 752 753 754@pytest.mark.parametrize("dtype", [np.float32, np.float64]) 755@pytest.mark.parametrize( 756 "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 757) 758@pytest.mark.parametrize( 759 "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 760) 761def test_euclidean_distances(dtype, x_array_constr, y_array_constr): 762 # check that euclidean distances gives same result as scipy cdist 763 # when X and Y != X are provided 764 rng = np.random.RandomState(0) 765 X = rng.random_sample((100, 10)).astype(dtype, copy=False) 766 X[X < 0.8] = 0 767 Y = rng.random_sample((10, 10)).astype(dtype, copy=False) 768 Y[Y < 0.8] = 0 769 770 expected = cdist(X, Y) 771 772 X = x_array_constr(X) 773 Y = y_array_constr(Y) 774 distances = euclidean_distances(X, Y) 775 776 # the default rtol=1e-7 is too close to the float32 precision 777 # and fails due to rounding errors. 778 assert_allclose(distances, expected, rtol=1e-6) 779 assert distances.dtype == dtype 780 781 782@pytest.mark.parametrize("dtype", [np.float32, np.float64]) 783@pytest.mark.parametrize( 784 "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 785) 786def test_euclidean_distances_sym(dtype, x_array_constr): 787 # check that euclidean distances gives same result as scipy pdist 788 # when only X is provided 789 rng = np.random.RandomState(0) 790 X = rng.random_sample((100, 10)).astype(dtype, copy=False) 791 X[X < 0.8] = 0 792 793 expected = squareform(pdist(X)) 794 795 X = x_array_constr(X) 796 distances = euclidean_distances(X) 797 798 # the default rtol=1e-7 is too close to the float32 precision 799 # and fails due to rounding errors. 800 assert_allclose(distances, expected, rtol=1e-6) 801 assert distances.dtype == dtype 802 803 804@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) 805@pytest.mark.parametrize( 806 "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 807) 808@pytest.mark.parametrize( 809 "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 810) 811def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr): 812 # check batches handling when Y != X (#13910) 813 rng = np.random.RandomState(0) 814 X = rng.random_sample((100, 10)).astype(np.float32) 815 X[X < 0.8] = 0 816 Y = rng.random_sample((10, 10)).astype(np.float32) 817 Y[Y < 0.8] = 0 818 819 expected = cdist(X, Y) 820 821 X = x_array_constr(X) 822 Y = y_array_constr(Y) 823 distances = _euclidean_distances_upcast(X, Y=Y, batch_size=batch_size) 824 distances = np.sqrt(np.maximum(distances, 0)) 825 826 # the default rtol=1e-7 is too close to the float32 precision 827 # and fails due to rounding errors. 828 assert_allclose(distances, expected, rtol=1e-6) 829 830 831@pytest.mark.parametrize("batch_size", [None, 5, 7, 101]) 832@pytest.mark.parametrize( 833 "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"] 834) 835def test_euclidean_distances_upcast_sym(batch_size, x_array_constr): 836 # check batches handling when X is Y (#13910) 837 rng = np.random.RandomState(0) 838 X = rng.random_sample((100, 10)).astype(np.float32) 839 X[X < 0.8] = 0 840 841 expected = squareform(pdist(X)) 842 843 X = x_array_constr(X) 844 distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size) 845 distances = np.sqrt(np.maximum(distances, 0)) 846 847 # the default rtol=1e-7 is too close to the float32 precision 848 # and fails due to rounding errors. 849 assert_allclose(distances, expected, rtol=1e-6) 850 851 852@pytest.mark.parametrize( 853 "dtype, eps, rtol", 854 [ 855 (np.float32, 1e-4, 1e-5), 856 pytest.param( 857 np.float64, 858 1e-8, 859 0.99, 860 marks=pytest.mark.xfail(reason="failing due to lack of precision"), 861 ), 862 ], 863) 864@pytest.mark.parametrize("dim", [1, 1000000]) 865def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim): 866 # check that euclidean distances is correct with float32 input thanks to 867 # upcasting. On float64 there are still precision issues. 868 X = np.array([[1.0] * dim], dtype=dtype) 869 Y = np.array([[1.0 + eps] * dim], dtype=dtype) 870 871 distances = euclidean_distances(X, Y) 872 expected = cdist(X, Y) 873 874 assert_allclose(distances, expected, rtol=1e-5) 875 876 877@pytest.mark.parametrize("squared", [True, False]) 878def test_nan_euclidean_distances_equal_to_euclidean_distance(squared): 879 # with no nan values 880 rng = np.random.RandomState(1337) 881 X = rng.randn(3, 4) 882 Y = rng.randn(4, 4) 883 884 normal_distance = euclidean_distances(X, Y=Y, squared=squared) 885 nan_distance = nan_euclidean_distances(X, Y=Y, squared=squared) 886 assert_allclose(normal_distance, nan_distance) 887 888 889@pytest.mark.parametrize("X", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]])]) 890@pytest.mark.parametrize("Y", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]]), None]) 891def test_nan_euclidean_distances_infinite_values(X, Y): 892 893 with pytest.raises(ValueError) as excinfo: 894 nan_euclidean_distances(X, Y=Y) 895 896 exp_msg = "Input contains infinity or a value too large for dtype('float64')." 897 assert exp_msg == str(excinfo.value) 898 899 900@pytest.mark.parametrize( 901 "X, X_diag, missing_value", 902 [ 903 (np.array([[0, 1], [1, 0]]), np.sqrt(2), np.nan), 904 (np.array([[0, 1], [1, np.nan]]), np.sqrt(2), np.nan), 905 (np.array([[np.nan, 1], [1, np.nan]]), np.nan, np.nan), 906 (np.array([[np.nan, 1], [np.nan, 0]]), np.sqrt(2), np.nan), 907 (np.array([[0, np.nan], [1, np.nan]]), np.sqrt(2), np.nan), 908 (np.array([[0, 1], [1, 0]]), np.sqrt(2), -1), 909 (np.array([[0, 1], [1, -1]]), np.sqrt(2), -1), 910 (np.array([[-1, 1], [1, -1]]), np.nan, -1), 911 (np.array([[-1, 1], [-1, 0]]), np.sqrt(2), -1), 912 (np.array([[0, -1], [1, -1]]), np.sqrt(2), -1), 913 ], 914) 915def test_nan_euclidean_distances_2x2(X, X_diag, missing_value): 916 917 exp_dist = np.array([[0.0, X_diag], [X_diag, 0]]) 918 919 dist = nan_euclidean_distances(X, missing_values=missing_value) 920 assert_allclose(exp_dist, dist) 921 922 dist_sq = nan_euclidean_distances(X, squared=True, missing_values=missing_value) 923 assert_allclose(exp_dist ** 2, dist_sq) 924 925 dist_two = nan_euclidean_distances(X, X, missing_values=missing_value) 926 assert_allclose(exp_dist, dist_two) 927 928 dist_two_copy = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) 929 assert_allclose(exp_dist, dist_two_copy) 930 931 932@pytest.mark.parametrize("missing_value", [np.nan, -1]) 933def test_nan_euclidean_distances_complete_nan(missing_value): 934 X = np.array([[missing_value, missing_value], [0, 1]]) 935 936 exp_dist = np.array([[np.nan, np.nan], [np.nan, 0]]) 937 938 dist = nan_euclidean_distances(X, missing_values=missing_value) 939 assert_allclose(exp_dist, dist) 940 941 dist = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) 942 assert_allclose(exp_dist, dist) 943 944 945@pytest.mark.parametrize("missing_value", [np.nan, -1]) 946def test_nan_euclidean_distances_not_trival(missing_value): 947 X = np.array( 948 [ 949 [1.0, missing_value, 3.0, 4.0, 2.0], 950 [missing_value, 4.0, 6.0, 1.0, missing_value], 951 [3.0, missing_value, missing_value, missing_value, 1.0], 952 ] 953 ) 954 955 Y = np.array( 956 [ 957 [missing_value, 7.0, 7.0, missing_value, 2.0], 958 [missing_value, missing_value, 5.0, 4.0, 7.0], 959 [missing_value, missing_value, missing_value, 4.0, 5.0], 960 ] 961 ) 962 963 # Check for symmetry 964 D1 = nan_euclidean_distances(X, Y, missing_values=missing_value) 965 D2 = nan_euclidean_distances(Y, X, missing_values=missing_value) 966 967 assert_almost_equal(D1, D2.T) 968 969 # Check with explicit formula and squared=True 970 assert_allclose( 971 nan_euclidean_distances( 972 X[:1], Y[:1], squared=True, missing_values=missing_value 973 ), 974 [[5.0 / 2.0 * ((7 - 3) ** 2 + (2 - 2) ** 2)]], 975 ) 976 977 # Check with explicit formula and squared=False 978 assert_allclose( 979 nan_euclidean_distances( 980 X[1:2], Y[1:2], squared=False, missing_values=missing_value 981 ), 982 [[np.sqrt(5.0 / 2.0 * ((6 - 5) ** 2 + (1 - 4) ** 2))]], 983 ) 984 985 # Check when Y = X is explicitly passed 986 D3 = nan_euclidean_distances(X, missing_values=missing_value) 987 D4 = nan_euclidean_distances(X, X, missing_values=missing_value) 988 D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value) 989 assert_allclose(D3, D4) 990 assert_allclose(D4, D5) 991 992 # Check copy = True against copy = False 993 D6 = nan_euclidean_distances(X, Y, copy=True) 994 D7 = nan_euclidean_distances(X, Y, copy=False) 995 assert_allclose(D6, D7) 996 997 998@pytest.mark.parametrize("missing_value", [np.nan, -1]) 999def test_nan_euclidean_distances_one_feature_match_positive(missing_value): 1000 # First feature is the only feature that is non-nan and in both 1001 # samples. The result of `nan_euclidean_distances` with squared=True 1002 # should be non-negative. The non-squared version should all be close to 0. 1003 X = np.array( 1004 [ 1005 [-122.27, 648.0, missing_value, 37.85], 1006 [-122.27, missing_value, 2.34701493, missing_value], 1007 ] 1008 ) 1009 1010 dist_squared = nan_euclidean_distances( 1011 X, missing_values=missing_value, squared=True 1012 ) 1013 assert np.all(dist_squared >= 0) 1014 1015 dist = nan_euclidean_distances(X, missing_values=missing_value, squared=False) 1016 assert_allclose(dist, 0.0) 1017 1018 1019def test_cosine_distances(): 1020 # Check the pairwise Cosine distances computation 1021 rng = np.random.RandomState(1337) 1022 x = np.abs(rng.rand(910)) 1023 XA = np.vstack([x, x]) 1024 D = cosine_distances(XA) 1025 assert_array_almost_equal(D, [[0.0, 0.0], [0.0, 0.0]]) 1026 # check that all elements are in [0, 2] 1027 assert np.all(D >= 0.0) 1028 assert np.all(D <= 2.0) 1029 # check that diagonal elements are equal to 0 1030 assert_array_almost_equal(D[np.diag_indices_from(D)], [0.0, 0.0]) 1031 1032 XB = np.vstack([x, -x]) 1033 D2 = cosine_distances(XB) 1034 # check that all elements are in [0, 2] 1035 assert np.all(D2 >= 0.0) 1036 assert np.all(D2 <= 2.0) 1037 # check that diagonal elements are equal to 0 and non diagonal to 2 1038 assert_array_almost_equal(D2, [[0.0, 2.0], [2.0, 0.0]]) 1039 1040 # check large random matrix 1041 X = np.abs(rng.rand(1000, 5000)) 1042 D = cosine_distances(X) 1043 # check that diagonal elements are equal to 0 1044 assert_array_almost_equal(D[np.diag_indices_from(D)], [0.0] * D.shape[0]) 1045 assert np.all(D >= 0.0) 1046 assert np.all(D <= 2.0) 1047 1048 1049def test_haversine_distances(): 1050 # Check haversine distance with distances computation 1051 def slow_haversine_distances(x, y): 1052 diff_lat = y[0] - x[0] 1053 diff_lon = y[1] - x[1] 1054 a = np.sin(diff_lat / 2) ** 2 + ( 1055 np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2) ** 2 1056 ) 1057 c = 2 * np.arcsin(np.sqrt(a)) 1058 return c 1059 1060 rng = np.random.RandomState(0) 1061 X = rng.random_sample((5, 2)) 1062 Y = rng.random_sample((10, 2)) 1063 D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X]) 1064 D2 = haversine_distances(X, Y) 1065 assert_array_almost_equal(D1, D2) 1066 # Test haversine distance does not accept X where n_feature != 2 1067 X = rng.random_sample((10, 3)) 1068 err_msg = "Haversine distance only valid in 2 dimensions" 1069 with pytest.raises(ValueError, match=err_msg): 1070 haversine_distances(X) 1071 1072 1073# Paired distances 1074 1075 1076def test_paired_euclidean_distances(): 1077 # Check the paired Euclidean distances computation 1078 X = [[0], [0]] 1079 Y = [[1], [2]] 1080 D = paired_euclidean_distances(X, Y) 1081 assert_array_almost_equal(D, [1.0, 2.0]) 1082 1083 1084def test_paired_manhattan_distances(): 1085 # Check the paired manhattan distances computation 1086 X = [[0], [0]] 1087 Y = [[1], [2]] 1088 D = paired_manhattan_distances(X, Y) 1089 assert_array_almost_equal(D, [1.0, 2.0]) 1090 1091 1092def test_chi_square_kernel(): 1093 rng = np.random.RandomState(0) 1094 X = rng.random_sample((5, 4)) 1095 Y = rng.random_sample((10, 4)) 1096 K_add = additive_chi2_kernel(X, Y) 1097 gamma = 0.1 1098 K = chi2_kernel(X, Y, gamma=gamma) 1099 assert K.dtype == float 1100 for i, x in enumerate(X): 1101 for j, y in enumerate(Y): 1102 chi2 = -np.sum((x - y) ** 2 / (x + y)) 1103 chi2_exp = np.exp(gamma * chi2) 1104 assert_almost_equal(K_add[i, j], chi2) 1105 assert_almost_equal(K[i, j], chi2_exp) 1106 1107 # check diagonal is ones for data with itself 1108 K = chi2_kernel(Y) 1109 assert_array_equal(np.diag(K), 1) 1110 # check off-diagonal is < 1 but > 0: 1111 assert np.all(K > 0) 1112 assert np.all(K - np.diag(np.diag(K)) < 1) 1113 # check that float32 is preserved 1114 X = rng.random_sample((5, 4)).astype(np.float32) 1115 Y = rng.random_sample((10, 4)).astype(np.float32) 1116 K = chi2_kernel(X, Y) 1117 assert K.dtype == np.float32 1118 1119 # check integer type gets converted, 1120 # check that zeros are handled 1121 X = rng.random_sample((10, 4)).astype(np.int32) 1122 K = chi2_kernel(X, X) 1123 assert np.isfinite(K).all() 1124 assert K.dtype == float 1125 1126 # check that kernel of similar things is greater than dissimilar ones 1127 X = [[0.3, 0.7], [1.0, 0]] 1128 Y = [[0, 1], [0.9, 0.1]] 1129 K = chi2_kernel(X, Y) 1130 assert K[0, 0] > K[0, 1] 1131 assert K[1, 1] > K[1, 0] 1132 1133 # test negative input 1134 with pytest.raises(ValueError): 1135 chi2_kernel([[0, -1]]) 1136 with pytest.raises(ValueError): 1137 chi2_kernel([[0, -1]], [[-1, -1]]) 1138 with pytest.raises(ValueError): 1139 chi2_kernel([[0, 1]], [[-1, -1]]) 1140 1141 # different n_features in X and Y 1142 with pytest.raises(ValueError): 1143 chi2_kernel([[0, 1]], [[0.2, 0.2, 0.6]]) 1144 1145 # sparse matrices 1146 with pytest.raises(ValueError): 1147 chi2_kernel(csr_matrix(X), csr_matrix(Y)) 1148 with pytest.raises(ValueError): 1149 additive_chi2_kernel(csr_matrix(X), csr_matrix(Y)) 1150 1151 1152@pytest.mark.parametrize( 1153 "kernel", 1154 ( 1155 linear_kernel, 1156 polynomial_kernel, 1157 rbf_kernel, 1158 laplacian_kernel, 1159 sigmoid_kernel, 1160 cosine_similarity, 1161 ), 1162) 1163def test_kernel_symmetry(kernel): 1164 # Valid kernels should be symmetric 1165 rng = np.random.RandomState(0) 1166 X = rng.random_sample((5, 4)) 1167 K = kernel(X, X) 1168 assert_array_almost_equal(K, K.T, 15) 1169 1170 1171@pytest.mark.parametrize( 1172 "kernel", 1173 ( 1174 linear_kernel, 1175 polynomial_kernel, 1176 rbf_kernel, 1177 laplacian_kernel, 1178 sigmoid_kernel, 1179 cosine_similarity, 1180 ), 1181) 1182def test_kernel_sparse(kernel): 1183 rng = np.random.RandomState(0) 1184 X = rng.random_sample((5, 4)) 1185 X_sparse = csr_matrix(X) 1186 K = kernel(X, X) 1187 K2 = kernel(X_sparse, X_sparse) 1188 assert_array_almost_equal(K, K2) 1189 1190 1191def test_linear_kernel(): 1192 rng = np.random.RandomState(0) 1193 X = rng.random_sample((5, 4)) 1194 K = linear_kernel(X, X) 1195 # the diagonal elements of a linear kernel are their squared norm 1196 assert_array_almost_equal(K.flat[::6], [linalg.norm(x) ** 2 for x in X]) 1197 1198 1199def test_rbf_kernel(): 1200 rng = np.random.RandomState(0) 1201 X = rng.random_sample((5, 4)) 1202 K = rbf_kernel(X, X) 1203 # the diagonal elements of a rbf kernel are 1 1204 assert_array_almost_equal(K.flat[::6], np.ones(5)) 1205 1206 1207def test_laplacian_kernel(): 1208 rng = np.random.RandomState(0) 1209 X = rng.random_sample((5, 4)) 1210 K = laplacian_kernel(X, X) 1211 # the diagonal elements of a laplacian kernel are 1 1212 assert_array_almost_equal(np.diag(K), np.ones(5)) 1213 1214 # off-diagonal elements are < 1 but > 0: 1215 assert np.all(K > 0) 1216 assert np.all(K - np.diag(np.diag(K)) < 1) 1217 1218 1219@pytest.mark.parametrize( 1220 "metric, pairwise_func", [("linear", linear_kernel), ("cosine", cosine_similarity)] 1221) 1222def test_pairwise_similarity_sparse_output(metric, pairwise_func): 1223 rng = np.random.RandomState(0) 1224 X = rng.random_sample((5, 4)) 1225 Y = rng.random_sample((3, 4)) 1226 Xcsr = csr_matrix(X) 1227 Ycsr = csr_matrix(Y) 1228 1229 # should be sparse 1230 K1 = pairwise_func(Xcsr, Ycsr, dense_output=False) 1231 assert issparse(K1) 1232 1233 # should be dense, and equal to K1 1234 K2 = pairwise_func(X, Y, dense_output=True) 1235 assert not issparse(K2) 1236 assert_array_almost_equal(K1.todense(), K2) 1237 1238 # show the kernel output equal to the sparse.todense() 1239 K3 = pairwise_kernels(X, Y=Y, metric=metric) 1240 assert_array_almost_equal(K1.todense(), K3) 1241 1242 1243def test_cosine_similarity(): 1244 # Test the cosine_similarity. 1245 1246 rng = np.random.RandomState(0) 1247 X = rng.random_sample((5, 4)) 1248 Y = rng.random_sample((3, 4)) 1249 Xcsr = csr_matrix(X) 1250 Ycsr = csr_matrix(Y) 1251 1252 for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)): 1253 # Test that the cosine is kernel is equal to a linear kernel when data 1254 # has been previously normalized by L2-norm. 1255 K1 = pairwise_kernels(X_, Y=Y_, metric="cosine") 1256 X_ = normalize(X_) 1257 if Y_ is not None: 1258 Y_ = normalize(Y_) 1259 K2 = pairwise_kernels(X_, Y=Y_, metric="linear") 1260 assert_array_almost_equal(K1, K2) 1261 1262 1263def test_check_dense_matrices(): 1264 # Ensure that pairwise array check works for dense matrices. 1265 # Check that if XB is None, XB is returned as reference to XA 1266 XA = np.resize(np.arange(40), (5, 8)) 1267 XA_checked, XB_checked = check_pairwise_arrays(XA, None) 1268 assert XA_checked is XB_checked 1269 assert_array_equal(XA, XA_checked) 1270 1271 1272def test_check_XB_returned(): 1273 # Ensure that if XA and XB are given correctly, they return as equal. 1274 # Check that if XB is not None, it is returned equal. 1275 # Note that the second dimension of XB is the same as XA. 1276 XA = np.resize(np.arange(40), (5, 8)) 1277 XB = np.resize(np.arange(32), (4, 8)) 1278 XA_checked, XB_checked = check_pairwise_arrays(XA, XB) 1279 assert_array_equal(XA, XA_checked) 1280 assert_array_equal(XB, XB_checked) 1281 1282 XB = np.resize(np.arange(40), (5, 8)) 1283 XA_checked, XB_checked = check_paired_arrays(XA, XB) 1284 assert_array_equal(XA, XA_checked) 1285 assert_array_equal(XB, XB_checked) 1286 1287 1288def test_check_different_dimensions(): 1289 # Ensure an error is raised if the dimensions are different. 1290 XA = np.resize(np.arange(45), (5, 9)) 1291 XB = np.resize(np.arange(32), (4, 8)) 1292 with pytest.raises(ValueError): 1293 check_pairwise_arrays(XA, XB) 1294 1295 XB = np.resize(np.arange(4 * 9), (4, 9)) 1296 with pytest.raises(ValueError): 1297 check_paired_arrays(XA, XB) 1298 1299 1300def test_check_invalid_dimensions(): 1301 # Ensure an error is raised on 1D input arrays. 1302 # The modified tests are not 1D. In the old test, the array was internally 1303 # converted to 2D anyways 1304 XA = np.arange(45).reshape(9, 5) 1305 XB = np.arange(32).reshape(4, 8) 1306 with pytest.raises(ValueError): 1307 check_pairwise_arrays(XA, XB) 1308 XA = np.arange(45).reshape(9, 5) 1309 XB = np.arange(32).reshape(4, 8) 1310 with pytest.raises(ValueError): 1311 check_pairwise_arrays(XA, XB) 1312 1313 1314def test_check_sparse_arrays(): 1315 # Ensures that checks return valid sparse matrices. 1316 rng = np.random.RandomState(0) 1317 XA = rng.random_sample((5, 4)) 1318 XA_sparse = csr_matrix(XA) 1319 XB = rng.random_sample((5, 4)) 1320 XB_sparse = csr_matrix(XB) 1321 XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse) 1322 # compare their difference because testing csr matrices for 1323 # equality with '==' does not work as expected. 1324 assert issparse(XA_checked) 1325 assert abs(XA_sparse - XA_checked).sum() == 0 1326 assert issparse(XB_checked) 1327 assert abs(XB_sparse - XB_checked).sum() == 0 1328 1329 XA_checked, XA_2_checked = check_pairwise_arrays(XA_sparse, XA_sparse) 1330 assert issparse(XA_checked) 1331 assert abs(XA_sparse - XA_checked).sum() == 0 1332 assert issparse(XA_2_checked) 1333 assert abs(XA_2_checked - XA_checked).sum() == 0 1334 1335 1336def tuplify(X): 1337 # Turns a numpy matrix (any n-dimensional array) into tuples. 1338 s = X.shape 1339 if len(s) > 1: 1340 # Tuplify each sub-array in the input. 1341 return tuple(tuplify(row) for row in X) 1342 else: 1343 # Single dimension input, just return tuple of contents. 1344 return tuple(r for r in X) 1345 1346 1347def test_check_tuple_input(): 1348 # Ensures that checks return valid tuples. 1349 rng = np.random.RandomState(0) 1350 XA = rng.random_sample((5, 4)) 1351 XA_tuples = tuplify(XA) 1352 XB = rng.random_sample((5, 4)) 1353 XB_tuples = tuplify(XB) 1354 XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples) 1355 assert_array_equal(XA_tuples, XA_checked) 1356 assert_array_equal(XB_tuples, XB_checked) 1357 1358 1359def test_check_preserve_type(): 1360 # Ensures that type float32 is preserved. 1361 XA = np.resize(np.arange(40), (5, 8)).astype(np.float32) 1362 XB = np.resize(np.arange(40), (5, 8)).astype(np.float32) 1363 1364 XA_checked, XB_checked = check_pairwise_arrays(XA, None) 1365 assert XA_checked.dtype == np.float32 1366 1367 # both float32 1368 XA_checked, XB_checked = check_pairwise_arrays(XA, XB) 1369 assert XA_checked.dtype == np.float32 1370 assert XB_checked.dtype == np.float32 1371 1372 # mismatched A 1373 XA_checked, XB_checked = check_pairwise_arrays(XA.astype(float), XB) 1374 assert XA_checked.dtype == float 1375 assert XB_checked.dtype == float 1376 1377 # mismatched B 1378 XA_checked, XB_checked = check_pairwise_arrays(XA, XB.astype(float)) 1379 assert XA_checked.dtype == float 1380 assert XB_checked.dtype == float 1381 1382 1383@pytest.mark.parametrize("n_jobs", [1, 2]) 1384@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) 1385@pytest.mark.parametrize( 1386 "dist_function", [pairwise_distances, pairwise_distances_chunked] 1387) 1388def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function): 1389 # check that pairwise_distances give the same result in sequential and 1390 # parallel, when metric has data-derived parameters. 1391 with config_context(working_memory=0.1): # to have more than 1 chunk 1392 rng = np.random.RandomState(0) 1393 X = rng.random_sample((100, 10)) 1394 1395 expected_dist = squareform(pdist(X, metric=metric)) 1396 dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs))) 1397 1398 assert_allclose(dist, expected_dist) 1399 1400 1401@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"]) 1402def test_pairwise_distances_data_derived_params_error(metric): 1403 # check that pairwise_distances raises an error when Y is passed but 1404 # metric has data-derived params that are not provided by the user. 1405 rng = np.random.RandomState(0) 1406 X = rng.random_sample((100, 10)) 1407 Y = rng.random_sample((100, 10)) 1408 1409 with pytest.raises( 1410 ValueError, 1411 match=fr"The '(V|VI)' parameter is required for the " fr"{metric} metric", 1412 ): 1413 pairwise_distances(X, Y, metric=metric) 1414 1415 1416@pytest.mark.parametrize( 1417 "metric", 1418 [ 1419 "braycurtis", 1420 "canberra", 1421 "chebyshev", 1422 "correlation", 1423 "hamming", 1424 "mahalanobis", 1425 "minkowski", 1426 "seuclidean", 1427 "sqeuclidean", 1428 "cityblock", 1429 "cosine", 1430 "euclidean", 1431 ], 1432) 1433@pytest.mark.parametrize("dtype", [np.float32, np.float64]) 1434@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"]) 1435def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x): 1436 # Check that pairwise distances gives the same result as pdist and cdist 1437 # regardless of input datatype when using any scipy metric for comparing 1438 # numeric vectors 1439 # 1440 # This test is necessary because pairwise_distances used to throw an 1441 # error when using metric='seuclidean' and the input data was not 1442 # of type np.float64 (#15730) 1443 1444 rng = np.random.RandomState(0) 1445 1446 X = rng.random_sample((5, 4)).astype(dtype) 1447 1448 params = {} 1449 if y_is_x: 1450 Y = X 1451 expected_dist = squareform(pdist(X, metric=metric)) 1452 else: 1453 Y = rng.random_sample((5, 4)).astype(dtype) 1454 expected_dist = cdist(X, Y, metric=metric) 1455 # precompute parameters for seuclidean & mahalanobis when x is not y 1456 if metric == "seuclidean": 1457 params = {"V": np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=np.float64)} 1458 elif metric == "mahalanobis": 1459 params = {"VI": np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T} 1460 1461 dist = pairwise_distances(X, Y, metric=metric, **params) 1462 1463 # the default rtol=1e-7 is too close to the float32 precision 1464 # and fails due to rounding errors 1465 rtol = 1e-5 if dtype is np.float32 else 1e-7 1466 assert_allclose(dist, expected_dist, rtol=rtol) 1467