1from types import GeneratorType
2
3import numpy as np
4from numpy import linalg
5
6from scipy.sparse import dok_matrix, csr_matrix, issparse
7from scipy.spatial.distance import cosine, cityblock, minkowski
8from scipy.spatial.distance import cdist, pdist, squareform
9
10try:
11    from scipy.spatial.distance import wminkowski
12except ImportError:
13    # In scipy 1.6.0, wminkowski is deprecated and minkowski
14    # should be used instead.
15    from scipy.spatial.distance import minkowski as wminkowski
16
17from sklearn.utils.fixes import sp_version, parse_version
18
19import pytest
20
21from sklearn import config_context
22
23from sklearn.utils._testing import assert_array_almost_equal
24from sklearn.utils._testing import assert_allclose
25from sklearn.utils._testing import assert_almost_equal
26from sklearn.utils._testing import assert_array_equal
27from sklearn.utils._testing import ignore_warnings
28
29from sklearn.metrics.pairwise import euclidean_distances
30from sklearn.metrics.pairwise import nan_euclidean_distances
31from sklearn.metrics.pairwise import manhattan_distances
32from sklearn.metrics.pairwise import haversine_distances
33from sklearn.metrics.pairwise import linear_kernel
34from sklearn.metrics.pairwise import chi2_kernel, additive_chi2_kernel
35from sklearn.metrics.pairwise import polynomial_kernel
36from sklearn.metrics.pairwise import rbf_kernel
37from sklearn.metrics.pairwise import laplacian_kernel
38from sklearn.metrics.pairwise import sigmoid_kernel
39from sklearn.metrics.pairwise import cosine_similarity
40from sklearn.metrics.pairwise import cosine_distances
41from sklearn.metrics.pairwise import pairwise_distances
42from sklearn.metrics.pairwise import pairwise_distances_chunked
43from sklearn.metrics.pairwise import pairwise_distances_argmin_min
44from sklearn.metrics.pairwise import pairwise_distances_argmin
45from sklearn.metrics.pairwise import pairwise_kernels
46from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
47from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
48from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
49from sklearn.metrics.pairwise import PAIRED_DISTANCES
50from sklearn.metrics.pairwise import check_pairwise_arrays
51from sklearn.metrics.pairwise import check_paired_arrays
52from sklearn.metrics.pairwise import paired_distances
53from sklearn.metrics.pairwise import paired_euclidean_distances
54from sklearn.metrics.pairwise import paired_manhattan_distances
55from sklearn.metrics.pairwise import _euclidean_distances_upcast
56from sklearn.preprocessing import normalize
57from sklearn.exceptions import DataConversionWarning
58
59
60def test_pairwise_distances():
61    # Test the pairwise_distance helper function.
62    rng = np.random.RandomState(0)
63
64    # Euclidean distance should be equivalent to calling the function.
65    X = rng.random_sample((5, 4))
66    S = pairwise_distances(X, metric="euclidean")
67    S2 = euclidean_distances(X)
68    assert_array_almost_equal(S, S2)
69
70    # Euclidean distance, with Y != X.
71    Y = rng.random_sample((2, 4))
72    S = pairwise_distances(X, Y, metric="euclidean")
73    S2 = euclidean_distances(X, Y)
74    assert_array_almost_equal(S, S2)
75    # Check to ensure NaNs work with pairwise_distances.
76    X_masked = rng.random_sample((5, 4))
77    Y_masked = rng.random_sample((2, 4))
78    X_masked[0, 0] = np.nan
79    Y_masked[0, 0] = np.nan
80    S_masked = pairwise_distances(X_masked, Y_masked, metric="nan_euclidean")
81    S2_masked = nan_euclidean_distances(X_masked, Y_masked)
82    assert_array_almost_equal(S_masked, S2_masked)
83    # Test with tuples as X and Y
84    X_tuples = tuple([tuple([v for v in row]) for row in X])
85    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
86    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
87    assert_array_almost_equal(S, S2)
88
89    # Test haversine distance
90    # The data should be valid latitude and longitude
91    X = rng.random_sample((5, 2))
92    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi / 2
93    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
94    S = pairwise_distances(X, metric="haversine")
95    S2 = haversine_distances(X)
96    assert_array_almost_equal(S, S2)
97
98    # Test haversine distance, with Y != X
99    Y = rng.random_sample((2, 2))
100    Y[:, 0] = (Y[:, 0] - 0.5) * 2 * np.pi / 2
101    Y[:, 1] = (Y[:, 1] - 0.5) * 2 * np.pi
102    S = pairwise_distances(X, Y, metric="haversine")
103    S2 = haversine_distances(X, Y)
104    assert_array_almost_equal(S, S2)
105
106    # "cityblock" uses scikit-learn metric, cityblock (function) is
107    # scipy.spatial.
108    S = pairwise_distances(X, metric="cityblock")
109    S2 = pairwise_distances(X, metric=cityblock)
110    assert S.shape[0] == S.shape[1]
111    assert S.shape[0] == X.shape[0]
112    assert_array_almost_equal(S, S2)
113
114    # The manhattan metric should be equivalent to cityblock.
115    S = pairwise_distances(X, Y, metric="manhattan")
116    S2 = pairwise_distances(X, Y, metric=cityblock)
117    assert S.shape[0] == X.shape[0]
118    assert S.shape[1] == Y.shape[0]
119    assert_array_almost_equal(S, S2)
120
121    # Test cosine as a string metric versus cosine callable
122    # The string "cosine" uses sklearn.metric,
123    # while the function cosine is scipy.spatial
124    S = pairwise_distances(X, Y, metric="cosine")
125    S2 = pairwise_distances(X, Y, metric=cosine)
126    assert S.shape[0] == X.shape[0]
127    assert S.shape[1] == Y.shape[0]
128    assert_array_almost_equal(S, S2)
129
130    # Test with sparse X and Y,
131    # currently only supported for Euclidean, L1 and cosine.
132    X_sparse = csr_matrix(X)
133    Y_sparse = csr_matrix(Y)
134    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
135    S2 = euclidean_distances(X_sparse, Y_sparse)
136    assert_array_almost_equal(S, S2)
137    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
138    S2 = cosine_distances(X_sparse, Y_sparse)
139    assert_array_almost_equal(S, S2)
140    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
141    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
142    assert_array_almost_equal(S, S2)
143    S2 = manhattan_distances(X, Y)
144    assert_array_almost_equal(S, S2)
145
146    # Test with scipy.spatial.distance metric, with a kwd
147    kwds = {"p": 2.0}
148    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
149    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
150    assert_array_almost_equal(S, S2)
151
152    # same with Y = None
153    kwds = {"p": 2.0}
154    S = pairwise_distances(X, metric="minkowski", **kwds)
155    S2 = pairwise_distances(X, metric=minkowski, **kwds)
156    assert_array_almost_equal(S, S2)
157
158    # Test that scipy distance metrics throw an error if sparse matrix given
159    with pytest.raises(TypeError):
160        pairwise_distances(X_sparse, metric="minkowski")
161    with pytest.raises(TypeError):
162        pairwise_distances(X, Y_sparse, metric="minkowski")
163
164    # Test that a value error is raised if the metric is unknown
165    with pytest.raises(ValueError):
166        pairwise_distances(X, Y, metric="blah")
167
168
169@pytest.mark.parametrize("metric", PAIRWISE_BOOLEAN_FUNCTIONS)
170def test_pairwise_boolean_distance(metric):
171    # test that we convert to boolean arrays for boolean distances
172    rng = np.random.RandomState(0)
173    X = rng.randn(5, 4)
174    Y = X.copy()
175    Y[0, 0] = 1 - Y[0, 0]
176
177    # ignore conversion to boolean in pairwise_distances
178    with ignore_warnings(category=DataConversionWarning):
179        for Z in [Y, None]:
180            res = pairwise_distances(X, Z, metric=metric)
181            res[np.isnan(res)] = 0
182            assert np.sum(res != 0) == 0
183
184    # non-boolean arrays are converted to boolean for boolean
185    # distance metrics with a data conversion warning
186    msg = "Data was converted to boolean for metric %s" % metric
187    with pytest.warns(DataConversionWarning, match=msg):
188        pairwise_distances(X, metric=metric)
189
190    # Check that the warning is raised if X is boolean by Y is not boolean:
191    with pytest.warns(DataConversionWarning, match=msg):
192        pairwise_distances(X.astype(bool), Y=Y, metric=metric)
193
194    # Check that no warning is raised if X is already boolean and Y is None:
195    with pytest.warns(None) as records:
196        pairwise_distances(X.astype(bool), metric=metric)
197    assert len(records) == 0
198
199
200def test_no_data_conversion_warning():
201    # No warnings issued if metric is not a boolean distance function
202    rng = np.random.RandomState(0)
203    X = rng.randn(5, 4)
204    with pytest.warns(None) as records:
205        pairwise_distances(X, metric="minkowski")
206    assert len(records) == 0
207
208
209@pytest.mark.parametrize("func", [pairwise_distances, pairwise_kernels])
210def test_pairwise_precomputed(func):
211    # Test correct shape
212    with pytest.raises(ValueError, match=".* shape .*"):
213        func(np.zeros((5, 3)), metric="precomputed")
214    # with two args
215    with pytest.raises(ValueError, match=".* shape .*"):
216        func(np.zeros((5, 3)), np.zeros((4, 4)), metric="precomputed")
217    # even if shape[1] agrees (although thus second arg is spurious)
218    with pytest.raises(ValueError, match=".* shape .*"):
219        func(np.zeros((5, 3)), np.zeros((4, 3)), metric="precomputed")
220
221    # Test not copied (if appropriate dtype)
222    S = np.zeros((5, 5))
223    S2 = func(S, metric="precomputed")
224    assert S is S2
225    # with two args
226    S = np.zeros((5, 3))
227    S2 = func(S, np.zeros((3, 3)), metric="precomputed")
228    assert S is S2
229
230    # Test always returns float dtype
231    S = func(np.array([[1]], dtype="int"), metric="precomputed")
232    assert "f" == S.dtype.kind
233
234    # Test converts list to array-like
235    S = func([[1.0]], metric="precomputed")
236    assert isinstance(S, np.ndarray)
237
238
239def test_pairwise_precomputed_non_negative():
240    # Test non-negative values
241    with pytest.raises(ValueError, match=".* non-negative values.*"):
242        pairwise_distances(np.full((5, 5), -1), metric="precomputed")
243
244
245_minkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1}
246_wminkowski_kwds = {"w": np.arange(1, 5).astype("double", copy=False), "p": 1}
247
248
249def callable_rbf_kernel(x, y, **kwds):
250    # Callable version of pairwise.rbf_kernel.
251    K = rbf_kernel(np.atleast_2d(x), np.atleast_2d(y), **kwds)
252    return K
253
254
255@pytest.mark.parametrize(
256    "func, metric, kwds",
257    [
258        (pairwise_distances, "euclidean", {}),
259        pytest.param(
260            pairwise_distances,
261            minkowski,
262            _minkowski_kwds,
263            marks=pytest.mark.skipif(
264                sp_version < parse_version("1.0"),
265                reason="minkowski does not accept the w parameter prior to scipy 1.0.",
266            ),
267        ),
268        pytest.param(
269            pairwise_distances,
270            "minkowski",
271            _minkowski_kwds,
272            marks=pytest.mark.skipif(
273                sp_version < parse_version("1.0"),
274                reason="minkowski does not accept the w parameter prior to scipy 1.0.",
275            ),
276        ),
277        pytest.param(
278            pairwise_distances,
279            wminkowski,
280            _wminkowski_kwds,
281            marks=pytest.mark.skipif(
282                sp_version >= parse_version("1.6.0"),
283                reason="wminkowski is now minkowski and it has been already tested.",
284            ),
285        ),
286        pytest.param(
287            pairwise_distances,
288            "wminkowski",
289            _wminkowski_kwds,
290            marks=pytest.mark.skipif(
291                sp_version >= parse_version("1.6.0"),
292                reason="wminkowski is now minkowski and it has been already tested.",
293            ),
294        ),
295        (pairwise_kernels, "polynomial", {"degree": 1}),
296        (pairwise_kernels, callable_rbf_kernel, {"gamma": 0.1}),
297    ],
298)
299@pytest.mark.parametrize("dtype", [np.float64, int])
300def test_pairwise_parallel(func, metric, kwds, dtype):
301    rng = np.random.RandomState(0)
302    X = np.array(5 * rng.random_sample((5, 4)), dtype=dtype)
303    Y = np.array(5 * rng.random_sample((3, 4)), dtype=dtype)
304
305    S = func(X, metric=metric, n_jobs=1, **kwds)
306    S2 = func(X, metric=metric, n_jobs=2, **kwds)
307    assert_allclose(S, S2)
308
309    S = func(X, Y, metric=metric, n_jobs=1, **kwds)
310    S2 = func(X, Y, metric=metric, n_jobs=2, **kwds)
311    assert_allclose(S, S2)
312
313
314def test_pairwise_callable_nonstrict_metric():
315    # paired_distances should allow callable metric where metric(x, x) != 0
316    # Knowing that the callable is a strict metric would allow the diagonal to
317    # be left uncalculated and set to 0.
318    assert pairwise_distances([[1.0]], metric=lambda x, y: 5)[0, 0] == 5
319
320
321# Test with all metrics that should be in PAIRWISE_KERNEL_FUNCTIONS.
322@pytest.mark.parametrize(
323    "metric",
324    ["rbf", "laplacian", "sigmoid", "polynomial", "linear", "chi2", "additive_chi2"],
325)
326def test_pairwise_kernels(metric):
327    # Test the pairwise_kernels helper function.
328
329    rng = np.random.RandomState(0)
330    X = rng.random_sample((5, 4))
331    Y = rng.random_sample((2, 4))
332    function = PAIRWISE_KERNEL_FUNCTIONS[metric]
333    # Test with Y=None
334    K1 = pairwise_kernels(X, metric=metric)
335    K2 = function(X)
336    assert_array_almost_equal(K1, K2)
337    # Test with Y=Y
338    K1 = pairwise_kernels(X, Y=Y, metric=metric)
339    K2 = function(X, Y=Y)
340    assert_array_almost_equal(K1, K2)
341    # Test with tuples as X and Y
342    X_tuples = tuple([tuple([v for v in row]) for row in X])
343    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
344    K2 = pairwise_kernels(X_tuples, Y_tuples, metric=metric)
345    assert_array_almost_equal(K1, K2)
346
347    # Test with sparse X and Y
348    X_sparse = csr_matrix(X)
349    Y_sparse = csr_matrix(Y)
350    if metric in ["chi2", "additive_chi2"]:
351        # these don't support sparse matrices yet
352        with pytest.raises(ValueError):
353            pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric)
354        return
355    K1 = pairwise_kernels(X_sparse, Y=Y_sparse, metric=metric)
356    assert_array_almost_equal(K1, K2)
357
358
359def test_pairwise_kernels_callable():
360    # Test the pairwise_kernels helper function
361    # with a callable function, with given keywords.
362    rng = np.random.RandomState(0)
363    X = rng.random_sample((5, 4))
364    Y = rng.random_sample((2, 4))
365
366    metric = callable_rbf_kernel
367    kwds = {"gamma": 0.1}
368    K1 = pairwise_kernels(X, Y=Y, metric=metric, **kwds)
369    K2 = rbf_kernel(X, Y=Y, **kwds)
370    assert_array_almost_equal(K1, K2)
371
372    # callable function, X=Y
373    K1 = pairwise_kernels(X, Y=X, metric=metric, **kwds)
374    K2 = rbf_kernel(X, Y=X, **kwds)
375    assert_array_almost_equal(K1, K2)
376
377
378def test_pairwise_kernels_filter_param():
379    rng = np.random.RandomState(0)
380    X = rng.random_sample((5, 4))
381    Y = rng.random_sample((2, 4))
382    K = rbf_kernel(X, Y, gamma=0.1)
383    params = {"gamma": 0.1, "blabla": ":)"}
384    K2 = pairwise_kernels(X, Y, metric="rbf", filter_params=True, **params)
385    assert_array_almost_equal(K, K2)
386
387    with pytest.raises(TypeError):
388        pairwise_kernels(X, Y, metric="rbf", **params)
389
390
391@pytest.mark.parametrize("metric, func", PAIRED_DISTANCES.items())
392def test_paired_distances(metric, func):
393    # Test the pairwise_distance helper function.
394    rng = np.random.RandomState(0)
395    # Euclidean distance should be equivalent to calling the function.
396    X = rng.random_sample((5, 4))
397    # Euclidean distance, with Y != X.
398    Y = rng.random_sample((5, 4))
399
400    S = paired_distances(X, Y, metric=metric)
401    S2 = func(X, Y)
402    assert_array_almost_equal(S, S2)
403    S3 = func(csr_matrix(X), csr_matrix(Y))
404    assert_array_almost_equal(S, S3)
405    if metric in PAIRWISE_DISTANCE_FUNCTIONS:
406        # Check the pairwise_distances implementation
407        # gives the same value
408        distances = PAIRWISE_DISTANCE_FUNCTIONS[metric](X, Y)
409        distances = np.diag(distances)
410        assert_array_almost_equal(distances, S)
411
412
413def test_paired_distances_callable():
414    # Test the pairwise_distance helper function
415    # with the callable implementation
416    rng = np.random.RandomState(0)
417    # Euclidean distance should be equivalent to calling the function.
418    X = rng.random_sample((5, 4))
419    # Euclidean distance, with Y != X.
420    Y = rng.random_sample((5, 4))
421
422    S = paired_distances(X, Y, metric="manhattan")
423    S2 = paired_distances(X, Y, metric=lambda x, y: np.abs(x - y).sum(axis=0))
424    assert_array_almost_equal(S, S2)
425
426    # Test that a value error is raised when the lengths of X and Y should not
427    # differ
428    Y = rng.random_sample((3, 4))
429    with pytest.raises(ValueError):
430        paired_distances(X, Y)
431
432
433def test_pairwise_distances_argmin_min():
434    # Check pairwise minimum distances computation for any metric
435    X = [[0], [1]]
436    Y = [[-2], [3]]
437
438    Xsp = dok_matrix(X)
439    Ysp = csr_matrix(Y, dtype=np.float32)
440
441    expected_idx = [0, 1]
442    expected_vals = [2, 2]
443    expected_vals_sq = [4, 4]
444
445    # euclidean metric
446    idx, vals = pairwise_distances_argmin_min(X, Y, metric="euclidean")
447    idx2 = pairwise_distances_argmin(X, Y, metric="euclidean")
448    assert_array_almost_equal(idx, expected_idx)
449    assert_array_almost_equal(idx2, expected_idx)
450    assert_array_almost_equal(vals, expected_vals)
451    # sparse matrix case
452    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="euclidean")
453    assert_array_almost_equal(idxsp, expected_idx)
454    assert_array_almost_equal(valssp, expected_vals)
455    # We don't want np.matrix here
456    assert type(idxsp) == np.ndarray
457    assert type(valssp) == np.ndarray
458
459    # euclidean metric squared
460    idx, vals = pairwise_distances_argmin_min(
461        X, Y, metric="euclidean", metric_kwargs={"squared": True}
462    )
463    assert_array_almost_equal(idx, expected_idx)
464    assert_array_almost_equal(vals, expected_vals_sq)
465
466    # Non-euclidean scikit-learn metric
467    idx, vals = pairwise_distances_argmin_min(X, Y, metric="manhattan")
468    idx2 = pairwise_distances_argmin(X, Y, metric="manhattan")
469    assert_array_almost_equal(idx, expected_idx)
470    assert_array_almost_equal(idx2, expected_idx)
471    assert_array_almost_equal(vals, expected_vals)
472    # sparse matrix case
473    idxsp, valssp = pairwise_distances_argmin_min(Xsp, Ysp, metric="manhattan")
474    assert_array_almost_equal(idxsp, expected_idx)
475    assert_array_almost_equal(valssp, expected_vals)
476
477    # Non-euclidean Scipy distance (callable)
478    idx, vals = pairwise_distances_argmin_min(
479        X, Y, metric=minkowski, metric_kwargs={"p": 2}
480    )
481    assert_array_almost_equal(idx, expected_idx)
482    assert_array_almost_equal(vals, expected_vals)
483
484    # Non-euclidean Scipy distance (string)
485    idx, vals = pairwise_distances_argmin_min(
486        X, Y, metric="minkowski", metric_kwargs={"p": 2}
487    )
488    assert_array_almost_equal(idx, expected_idx)
489    assert_array_almost_equal(vals, expected_vals)
490
491    # Compare with naive implementation
492    rng = np.random.RandomState(0)
493    X = rng.randn(97, 149)
494    Y = rng.randn(111, 149)
495
496    dist = pairwise_distances(X, Y, metric="manhattan")
497    dist_orig_ind = dist.argmin(axis=0)
498    dist_orig_val = dist[dist_orig_ind, range(len(dist_orig_ind))]
499
500    dist_chunked_ind, dist_chunked_val = pairwise_distances_argmin_min(
501        X, Y, axis=0, metric="manhattan"
502    )
503    np.testing.assert_almost_equal(dist_orig_ind, dist_chunked_ind, decimal=7)
504    np.testing.assert_almost_equal(dist_orig_val, dist_chunked_val, decimal=7)
505
506
507def _reduce_func(dist, start):
508    return dist[:, :100]
509
510
511def test_pairwise_distances_chunked_reduce():
512    rng = np.random.RandomState(0)
513    X = rng.random_sample((400, 4))
514    # Reduced Euclidean distance
515    S = pairwise_distances(X)[:, :100]
516    S_chunks = pairwise_distances_chunked(
517        X, None, reduce_func=_reduce_func, working_memory=2 ** -16
518    )
519    assert isinstance(S_chunks, GeneratorType)
520    S_chunks = list(S_chunks)
521    assert len(S_chunks) > 1
522    # atol is for diagonal where S is explicitly zeroed on the diagonal
523    assert_allclose(np.vstack(S_chunks), S, atol=1e-7)
524
525
526def test_pairwise_distances_chunked_reduce_none():
527    # check that the reduce func is allowed to return None
528    rng = np.random.RandomState(0)
529    X = rng.random_sample((10, 4))
530    S_chunks = pairwise_distances_chunked(
531        X, None, reduce_func=lambda dist, start: None, working_memory=2 ** -16
532    )
533    assert isinstance(S_chunks, GeneratorType)
534    S_chunks = list(S_chunks)
535    assert len(S_chunks) > 1
536    assert all(chunk is None for chunk in S_chunks)
537
538
539@pytest.mark.parametrize(
540    "good_reduce",
541    [
542        lambda D, start: list(D),
543        lambda D, start: np.array(D),
544        lambda D, start: csr_matrix(D),
545        lambda D, start: (list(D), list(D)),
546        lambda D, start: (dok_matrix(D), np.array(D), list(D)),
547    ],
548)
549def test_pairwise_distances_chunked_reduce_valid(good_reduce):
550    X = np.arange(10).reshape(-1, 1)
551    S_chunks = pairwise_distances_chunked(
552        X, None, reduce_func=good_reduce, working_memory=64
553    )
554    next(S_chunks)
555
556
557@pytest.mark.parametrize(
558    ("bad_reduce", "err_type", "message"),
559    [
560        (
561            lambda D, s: np.concatenate([D, D[-1:]]),
562            ValueError,
563            r"length 11\..* input: 10\.",
564        ),
565        (
566            lambda D, s: (D, np.concatenate([D, D[-1:]])),
567            ValueError,
568            r"length \(10, 11\)\..* input: 10\.",
569        ),
570        (lambda D, s: (D[:9], D), ValueError, r"length \(9, 10\)\..* input: 10\."),
571        (
572            lambda D, s: 7,
573            TypeError,
574            r"returned 7\. Expected sequence\(s\) of length 10\.",
575        ),
576        (
577            lambda D, s: (7, 8),
578            TypeError,
579            r"returned \(7, 8\)\. Expected sequence\(s\) of length 10\.",
580        ),
581        (
582            lambda D, s: (np.arange(10), 9),
583            TypeError,
584            r", 9\)\. Expected sequence\(s\) of length 10\.",
585        ),
586    ],
587)
588def test_pairwise_distances_chunked_reduce_invalid(bad_reduce, err_type, message):
589    X = np.arange(10).reshape(-1, 1)
590    S_chunks = pairwise_distances_chunked(
591        X, None, reduce_func=bad_reduce, working_memory=64
592    )
593    with pytest.raises(err_type, match=message):
594        next(S_chunks)
595
596
597def check_pairwise_distances_chunked(X, Y, working_memory, metric="euclidean"):
598    gen = pairwise_distances_chunked(X, Y, working_memory=working_memory, metric=metric)
599    assert isinstance(gen, GeneratorType)
600    blockwise_distances = list(gen)
601    Y = X if Y is None else Y
602    min_block_mib = len(Y) * 8 * 2 ** -20
603
604    for block in blockwise_distances:
605        memory_used = block.nbytes
606        assert memory_used <= max(working_memory, min_block_mib) * 2 ** 20
607
608    blockwise_distances = np.vstack(blockwise_distances)
609    S = pairwise_distances(X, Y, metric=metric)
610    assert_array_almost_equal(blockwise_distances, S)
611
612
613@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean"))
614def test_pairwise_distances_chunked_diagonal(metric):
615    rng = np.random.RandomState(0)
616    X = rng.normal(size=(1000, 10), scale=1e10)
617    chunks = list(pairwise_distances_chunked(X, working_memory=1, metric=metric))
618    assert len(chunks) > 1
619    assert_array_almost_equal(np.diag(np.vstack(chunks)), 0, decimal=10)
620
621
622@pytest.mark.parametrize("metric", ("euclidean", "l2", "sqeuclidean"))
623def test_parallel_pairwise_distances_diagonal(metric):
624    rng = np.random.RandomState(0)
625    X = rng.normal(size=(1000, 10), scale=1e10)
626    distances = pairwise_distances(X, metric=metric, n_jobs=2)
627    assert_allclose(np.diag(distances), 0, atol=1e-10)
628
629
630@ignore_warnings
631def test_pairwise_distances_chunked():
632    # Test the pairwise_distance helper function.
633    rng = np.random.RandomState(0)
634    # Euclidean distance should be equivalent to calling the function.
635    X = rng.random_sample((200, 4))
636    check_pairwise_distances_chunked(X, None, working_memory=1, metric="euclidean")
637    # Test small amounts of memory
638    for power in range(-16, 0):
639        check_pairwise_distances_chunked(
640            X, None, working_memory=2 ** power, metric="euclidean"
641        )
642    # X as list
643    check_pairwise_distances_chunked(
644        X.tolist(), None, working_memory=1, metric="euclidean"
645    )
646    # Euclidean distance, with Y != X.
647    Y = rng.random_sample((100, 4))
648    check_pairwise_distances_chunked(X, Y, working_memory=1, metric="euclidean")
649    check_pairwise_distances_chunked(
650        X.tolist(), Y.tolist(), working_memory=1, metric="euclidean"
651    )
652    # absurdly large working_memory
653    check_pairwise_distances_chunked(X, Y, working_memory=10000, metric="euclidean")
654    # "cityblock" uses scikit-learn metric, cityblock (function) is
655    # scipy.spatial.
656    check_pairwise_distances_chunked(X, Y, working_memory=1, metric="cityblock")
657    # Test that a value error is raised if the metric is unknown
658    with pytest.raises(ValueError):
659        next(pairwise_distances_chunked(X, Y, metric="blah"))
660
661    # Test precomputed returns all at once
662    D = pairwise_distances(X)
663    gen = pairwise_distances_chunked(D, working_memory=2 ** -16, metric="precomputed")
664    assert isinstance(gen, GeneratorType)
665    assert next(gen) is D
666    with pytest.raises(StopIteration):
667        next(gen)
668
669
670@pytest.mark.parametrize(
671    "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
672)
673@pytest.mark.parametrize(
674    "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
675)
676def test_euclidean_distances_known_result(x_array_constr, y_array_constr):
677    # Check the pairwise Euclidean distances computation on known result
678    X = x_array_constr([[0]])
679    Y = y_array_constr([[1], [2]])
680    D = euclidean_distances(X, Y)
681    assert_allclose(D, [[1.0, 2.0]])
682
683
684@pytest.mark.parametrize("dtype", [np.float32, np.float64])
685@pytest.mark.parametrize(
686    "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
687)
688def test_euclidean_distances_with_norms(dtype, y_array_constr):
689    # check that we still get the right answers with {X,Y}_norm_squared
690    # and that we get a wrong answer with wrong {X,Y}_norm_squared
691    rng = np.random.RandomState(0)
692    X = rng.random_sample((10, 10)).astype(dtype, copy=False)
693    Y = rng.random_sample((20, 10)).astype(dtype, copy=False)
694
695    # norms will only be used if their dtype is float64
696    X_norm_sq = (X.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)
697    Y_norm_sq = (Y.astype(np.float64) ** 2).sum(axis=1).reshape(1, -1)
698
699    Y = y_array_constr(Y)
700
701    D1 = euclidean_distances(X, Y)
702    D2 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq)
703    D3 = euclidean_distances(X, Y, Y_norm_squared=Y_norm_sq)
704    D4 = euclidean_distances(X, Y, X_norm_squared=X_norm_sq, Y_norm_squared=Y_norm_sq)
705    assert_allclose(D2, D1)
706    assert_allclose(D3, D1)
707    assert_allclose(D4, D1)
708
709    # check we get the wrong answer with wrong {X,Y}_norm_squared
710    wrong_D = euclidean_distances(
711        X,
712        Y,
713        X_norm_squared=np.zeros_like(X_norm_sq),
714        Y_norm_squared=np.zeros_like(Y_norm_sq),
715    )
716    with pytest.raises(AssertionError):
717        assert_allclose(wrong_D, D1)
718
719
720def test_euclidean_distances_norm_shapes():
721    # Check all accepted shapes for the norms or appropriate error messages.
722    rng = np.random.RandomState(0)
723    X = rng.random_sample((10, 10))
724    Y = rng.random_sample((20, 10))
725
726    X_norm_squared = (X ** 2).sum(axis=1)
727    Y_norm_squared = (Y ** 2).sum(axis=1)
728
729    D1 = euclidean_distances(
730        X, Y, X_norm_squared=X_norm_squared, Y_norm_squared=Y_norm_squared
731    )
732    D2 = euclidean_distances(
733        X,
734        Y,
735        X_norm_squared=X_norm_squared.reshape(-1, 1),
736        Y_norm_squared=Y_norm_squared.reshape(-1, 1),
737    )
738    D3 = euclidean_distances(
739        X,
740        Y,
741        X_norm_squared=X_norm_squared.reshape(1, -1),
742        Y_norm_squared=Y_norm_squared.reshape(1, -1),
743    )
744
745    assert_allclose(D2, D1)
746    assert_allclose(D3, D1)
747
748    with pytest.raises(ValueError, match="Incompatible dimensions for X"):
749        euclidean_distances(X, Y, X_norm_squared=X_norm_squared[:5])
750    with pytest.raises(ValueError, match="Incompatible dimensions for Y"):
751        euclidean_distances(X, Y, Y_norm_squared=Y_norm_squared[:5])
752
753
754@pytest.mark.parametrize("dtype", [np.float32, np.float64])
755@pytest.mark.parametrize(
756    "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
757)
758@pytest.mark.parametrize(
759    "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
760)
761def test_euclidean_distances(dtype, x_array_constr, y_array_constr):
762    # check that euclidean distances gives same result as scipy cdist
763    # when X and Y != X are provided
764    rng = np.random.RandomState(0)
765    X = rng.random_sample((100, 10)).astype(dtype, copy=False)
766    X[X < 0.8] = 0
767    Y = rng.random_sample((10, 10)).astype(dtype, copy=False)
768    Y[Y < 0.8] = 0
769
770    expected = cdist(X, Y)
771
772    X = x_array_constr(X)
773    Y = y_array_constr(Y)
774    distances = euclidean_distances(X, Y)
775
776    # the default rtol=1e-7 is too close to the float32 precision
777    # and fails due to rounding errors.
778    assert_allclose(distances, expected, rtol=1e-6)
779    assert distances.dtype == dtype
780
781
782@pytest.mark.parametrize("dtype", [np.float32, np.float64])
783@pytest.mark.parametrize(
784    "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
785)
786def test_euclidean_distances_sym(dtype, x_array_constr):
787    # check that euclidean distances gives same result as scipy pdist
788    # when only X is provided
789    rng = np.random.RandomState(0)
790    X = rng.random_sample((100, 10)).astype(dtype, copy=False)
791    X[X < 0.8] = 0
792
793    expected = squareform(pdist(X))
794
795    X = x_array_constr(X)
796    distances = euclidean_distances(X)
797
798    # the default rtol=1e-7 is too close to the float32 precision
799    # and fails due to rounding errors.
800    assert_allclose(distances, expected, rtol=1e-6)
801    assert distances.dtype == dtype
802
803
804@pytest.mark.parametrize("batch_size", [None, 5, 7, 101])
805@pytest.mark.parametrize(
806    "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
807)
808@pytest.mark.parametrize(
809    "y_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
810)
811def test_euclidean_distances_upcast(batch_size, x_array_constr, y_array_constr):
812    # check batches handling when Y != X (#13910)
813    rng = np.random.RandomState(0)
814    X = rng.random_sample((100, 10)).astype(np.float32)
815    X[X < 0.8] = 0
816    Y = rng.random_sample((10, 10)).astype(np.float32)
817    Y[Y < 0.8] = 0
818
819    expected = cdist(X, Y)
820
821    X = x_array_constr(X)
822    Y = y_array_constr(Y)
823    distances = _euclidean_distances_upcast(X, Y=Y, batch_size=batch_size)
824    distances = np.sqrt(np.maximum(distances, 0))
825
826    # the default rtol=1e-7 is too close to the float32 precision
827    # and fails due to rounding errors.
828    assert_allclose(distances, expected, rtol=1e-6)
829
830
831@pytest.mark.parametrize("batch_size", [None, 5, 7, 101])
832@pytest.mark.parametrize(
833    "x_array_constr", [np.array, csr_matrix], ids=["dense", "sparse"]
834)
835def test_euclidean_distances_upcast_sym(batch_size, x_array_constr):
836    # check batches handling when X is Y (#13910)
837    rng = np.random.RandomState(0)
838    X = rng.random_sample((100, 10)).astype(np.float32)
839    X[X < 0.8] = 0
840
841    expected = squareform(pdist(X))
842
843    X = x_array_constr(X)
844    distances = _euclidean_distances_upcast(X, Y=X, batch_size=batch_size)
845    distances = np.sqrt(np.maximum(distances, 0))
846
847    # the default rtol=1e-7 is too close to the float32 precision
848    # and fails due to rounding errors.
849    assert_allclose(distances, expected, rtol=1e-6)
850
851
852@pytest.mark.parametrize(
853    "dtype, eps, rtol",
854    [
855        (np.float32, 1e-4, 1e-5),
856        pytest.param(
857            np.float64,
858            1e-8,
859            0.99,
860            marks=pytest.mark.xfail(reason="failing due to lack of precision"),
861        ),
862    ],
863)
864@pytest.mark.parametrize("dim", [1, 1000000])
865def test_euclidean_distances_extreme_values(dtype, eps, rtol, dim):
866    # check that euclidean distances is correct with float32 input thanks to
867    # upcasting. On float64 there are still precision issues.
868    X = np.array([[1.0] * dim], dtype=dtype)
869    Y = np.array([[1.0 + eps] * dim], dtype=dtype)
870
871    distances = euclidean_distances(X, Y)
872    expected = cdist(X, Y)
873
874    assert_allclose(distances, expected, rtol=1e-5)
875
876
877@pytest.mark.parametrize("squared", [True, False])
878def test_nan_euclidean_distances_equal_to_euclidean_distance(squared):
879    # with no nan values
880    rng = np.random.RandomState(1337)
881    X = rng.randn(3, 4)
882    Y = rng.randn(4, 4)
883
884    normal_distance = euclidean_distances(X, Y=Y, squared=squared)
885    nan_distance = nan_euclidean_distances(X, Y=Y, squared=squared)
886    assert_allclose(normal_distance, nan_distance)
887
888
889@pytest.mark.parametrize("X", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]])])
890@pytest.mark.parametrize("Y", [np.array([[np.inf, 0]]), np.array([[0, -np.inf]]), None])
891def test_nan_euclidean_distances_infinite_values(X, Y):
892
893    with pytest.raises(ValueError) as excinfo:
894        nan_euclidean_distances(X, Y=Y)
895
896    exp_msg = "Input contains infinity or a value too large for dtype('float64')."
897    assert exp_msg == str(excinfo.value)
898
899
900@pytest.mark.parametrize(
901    "X, X_diag, missing_value",
902    [
903        (np.array([[0, 1], [1, 0]]), np.sqrt(2), np.nan),
904        (np.array([[0, 1], [1, np.nan]]), np.sqrt(2), np.nan),
905        (np.array([[np.nan, 1], [1, np.nan]]), np.nan, np.nan),
906        (np.array([[np.nan, 1], [np.nan, 0]]), np.sqrt(2), np.nan),
907        (np.array([[0, np.nan], [1, np.nan]]), np.sqrt(2), np.nan),
908        (np.array([[0, 1], [1, 0]]), np.sqrt(2), -1),
909        (np.array([[0, 1], [1, -1]]), np.sqrt(2), -1),
910        (np.array([[-1, 1], [1, -1]]), np.nan, -1),
911        (np.array([[-1, 1], [-1, 0]]), np.sqrt(2), -1),
912        (np.array([[0, -1], [1, -1]]), np.sqrt(2), -1),
913    ],
914)
915def test_nan_euclidean_distances_2x2(X, X_diag, missing_value):
916
917    exp_dist = np.array([[0.0, X_diag], [X_diag, 0]])
918
919    dist = nan_euclidean_distances(X, missing_values=missing_value)
920    assert_allclose(exp_dist, dist)
921
922    dist_sq = nan_euclidean_distances(X, squared=True, missing_values=missing_value)
923    assert_allclose(exp_dist ** 2, dist_sq)
924
925    dist_two = nan_euclidean_distances(X, X, missing_values=missing_value)
926    assert_allclose(exp_dist, dist_two)
927
928    dist_two_copy = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
929    assert_allclose(exp_dist, dist_two_copy)
930
931
932@pytest.mark.parametrize("missing_value", [np.nan, -1])
933def test_nan_euclidean_distances_complete_nan(missing_value):
934    X = np.array([[missing_value, missing_value], [0, 1]])
935
936    exp_dist = np.array([[np.nan, np.nan], [np.nan, 0]])
937
938    dist = nan_euclidean_distances(X, missing_values=missing_value)
939    assert_allclose(exp_dist, dist)
940
941    dist = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
942    assert_allclose(exp_dist, dist)
943
944
945@pytest.mark.parametrize("missing_value", [np.nan, -1])
946def test_nan_euclidean_distances_not_trival(missing_value):
947    X = np.array(
948        [
949            [1.0, missing_value, 3.0, 4.0, 2.0],
950            [missing_value, 4.0, 6.0, 1.0, missing_value],
951            [3.0, missing_value, missing_value, missing_value, 1.0],
952        ]
953    )
954
955    Y = np.array(
956        [
957            [missing_value, 7.0, 7.0, missing_value, 2.0],
958            [missing_value, missing_value, 5.0, 4.0, 7.0],
959            [missing_value, missing_value, missing_value, 4.0, 5.0],
960        ]
961    )
962
963    # Check for symmetry
964    D1 = nan_euclidean_distances(X, Y, missing_values=missing_value)
965    D2 = nan_euclidean_distances(Y, X, missing_values=missing_value)
966
967    assert_almost_equal(D1, D2.T)
968
969    # Check with explicit formula and squared=True
970    assert_allclose(
971        nan_euclidean_distances(
972            X[:1], Y[:1], squared=True, missing_values=missing_value
973        ),
974        [[5.0 / 2.0 * ((7 - 3) ** 2 + (2 - 2) ** 2)]],
975    )
976
977    # Check with explicit formula and squared=False
978    assert_allclose(
979        nan_euclidean_distances(
980            X[1:2], Y[1:2], squared=False, missing_values=missing_value
981        ),
982        [[np.sqrt(5.0 / 2.0 * ((6 - 5) ** 2 + (1 - 4) ** 2))]],
983    )
984
985    # Check when Y = X is explicitly passed
986    D3 = nan_euclidean_distances(X, missing_values=missing_value)
987    D4 = nan_euclidean_distances(X, X, missing_values=missing_value)
988    D5 = nan_euclidean_distances(X, X.copy(), missing_values=missing_value)
989    assert_allclose(D3, D4)
990    assert_allclose(D4, D5)
991
992    # Check copy = True against copy = False
993    D6 = nan_euclidean_distances(X, Y, copy=True)
994    D7 = nan_euclidean_distances(X, Y, copy=False)
995    assert_allclose(D6, D7)
996
997
998@pytest.mark.parametrize("missing_value", [np.nan, -1])
999def test_nan_euclidean_distances_one_feature_match_positive(missing_value):
1000    # First feature is the only feature that is non-nan and in both
1001    # samples. The result of `nan_euclidean_distances` with squared=True
1002    # should be non-negative. The non-squared version should all be close to 0.
1003    X = np.array(
1004        [
1005            [-122.27, 648.0, missing_value, 37.85],
1006            [-122.27, missing_value, 2.34701493, missing_value],
1007        ]
1008    )
1009
1010    dist_squared = nan_euclidean_distances(
1011        X, missing_values=missing_value, squared=True
1012    )
1013    assert np.all(dist_squared >= 0)
1014
1015    dist = nan_euclidean_distances(X, missing_values=missing_value, squared=False)
1016    assert_allclose(dist, 0.0)
1017
1018
1019def test_cosine_distances():
1020    # Check the pairwise Cosine distances computation
1021    rng = np.random.RandomState(1337)
1022    x = np.abs(rng.rand(910))
1023    XA = np.vstack([x, x])
1024    D = cosine_distances(XA)
1025    assert_array_almost_equal(D, [[0.0, 0.0], [0.0, 0.0]])
1026    # check that all elements are in [0, 2]
1027    assert np.all(D >= 0.0)
1028    assert np.all(D <= 2.0)
1029    # check that diagonal elements are equal to 0
1030    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.0, 0.0])
1031
1032    XB = np.vstack([x, -x])
1033    D2 = cosine_distances(XB)
1034    # check that all elements are in [0, 2]
1035    assert np.all(D2 >= 0.0)
1036    assert np.all(D2 <= 2.0)
1037    # check that diagonal elements are equal to 0 and non diagonal to 2
1038    assert_array_almost_equal(D2, [[0.0, 2.0], [2.0, 0.0]])
1039
1040    # check large random matrix
1041    X = np.abs(rng.rand(1000, 5000))
1042    D = cosine_distances(X)
1043    # check that diagonal elements are equal to 0
1044    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.0] * D.shape[0])
1045    assert np.all(D >= 0.0)
1046    assert np.all(D <= 2.0)
1047
1048
1049def test_haversine_distances():
1050    # Check haversine distance with distances computation
1051    def slow_haversine_distances(x, y):
1052        diff_lat = y[0] - x[0]
1053        diff_lon = y[1] - x[1]
1054        a = np.sin(diff_lat / 2) ** 2 + (
1055            np.cos(x[0]) * np.cos(y[0]) * np.sin(diff_lon / 2) ** 2
1056        )
1057        c = 2 * np.arcsin(np.sqrt(a))
1058        return c
1059
1060    rng = np.random.RandomState(0)
1061    X = rng.random_sample((5, 2))
1062    Y = rng.random_sample((10, 2))
1063    D1 = np.array([[slow_haversine_distances(x, y) for y in Y] for x in X])
1064    D2 = haversine_distances(X, Y)
1065    assert_array_almost_equal(D1, D2)
1066    # Test haversine distance does not accept X where n_feature != 2
1067    X = rng.random_sample((10, 3))
1068    err_msg = "Haversine distance only valid in 2 dimensions"
1069    with pytest.raises(ValueError, match=err_msg):
1070        haversine_distances(X)
1071
1072
1073# Paired distances
1074
1075
1076def test_paired_euclidean_distances():
1077    # Check the paired Euclidean distances computation
1078    X = [[0], [0]]
1079    Y = [[1], [2]]
1080    D = paired_euclidean_distances(X, Y)
1081    assert_array_almost_equal(D, [1.0, 2.0])
1082
1083
1084def test_paired_manhattan_distances():
1085    # Check the paired manhattan distances computation
1086    X = [[0], [0]]
1087    Y = [[1], [2]]
1088    D = paired_manhattan_distances(X, Y)
1089    assert_array_almost_equal(D, [1.0, 2.0])
1090
1091
1092def test_chi_square_kernel():
1093    rng = np.random.RandomState(0)
1094    X = rng.random_sample((5, 4))
1095    Y = rng.random_sample((10, 4))
1096    K_add = additive_chi2_kernel(X, Y)
1097    gamma = 0.1
1098    K = chi2_kernel(X, Y, gamma=gamma)
1099    assert K.dtype == float
1100    for i, x in enumerate(X):
1101        for j, y in enumerate(Y):
1102            chi2 = -np.sum((x - y) ** 2 / (x + y))
1103            chi2_exp = np.exp(gamma * chi2)
1104            assert_almost_equal(K_add[i, j], chi2)
1105            assert_almost_equal(K[i, j], chi2_exp)
1106
1107    # check diagonal is ones for data with itself
1108    K = chi2_kernel(Y)
1109    assert_array_equal(np.diag(K), 1)
1110    # check off-diagonal is < 1 but > 0:
1111    assert np.all(K > 0)
1112    assert np.all(K - np.diag(np.diag(K)) < 1)
1113    # check that float32 is preserved
1114    X = rng.random_sample((5, 4)).astype(np.float32)
1115    Y = rng.random_sample((10, 4)).astype(np.float32)
1116    K = chi2_kernel(X, Y)
1117    assert K.dtype == np.float32
1118
1119    # check integer type gets converted,
1120    # check that zeros are handled
1121    X = rng.random_sample((10, 4)).astype(np.int32)
1122    K = chi2_kernel(X, X)
1123    assert np.isfinite(K).all()
1124    assert K.dtype == float
1125
1126    # check that kernel of similar things is greater than dissimilar ones
1127    X = [[0.3, 0.7], [1.0, 0]]
1128    Y = [[0, 1], [0.9, 0.1]]
1129    K = chi2_kernel(X, Y)
1130    assert K[0, 0] > K[0, 1]
1131    assert K[1, 1] > K[1, 0]
1132
1133    # test negative input
1134    with pytest.raises(ValueError):
1135        chi2_kernel([[0, -1]])
1136    with pytest.raises(ValueError):
1137        chi2_kernel([[0, -1]], [[-1, -1]])
1138    with pytest.raises(ValueError):
1139        chi2_kernel([[0, 1]], [[-1, -1]])
1140
1141    # different n_features in X and Y
1142    with pytest.raises(ValueError):
1143        chi2_kernel([[0, 1]], [[0.2, 0.2, 0.6]])
1144
1145    # sparse matrices
1146    with pytest.raises(ValueError):
1147        chi2_kernel(csr_matrix(X), csr_matrix(Y))
1148    with pytest.raises(ValueError):
1149        additive_chi2_kernel(csr_matrix(X), csr_matrix(Y))
1150
1151
1152@pytest.mark.parametrize(
1153    "kernel",
1154    (
1155        linear_kernel,
1156        polynomial_kernel,
1157        rbf_kernel,
1158        laplacian_kernel,
1159        sigmoid_kernel,
1160        cosine_similarity,
1161    ),
1162)
1163def test_kernel_symmetry(kernel):
1164    # Valid kernels should be symmetric
1165    rng = np.random.RandomState(0)
1166    X = rng.random_sample((5, 4))
1167    K = kernel(X, X)
1168    assert_array_almost_equal(K, K.T, 15)
1169
1170
1171@pytest.mark.parametrize(
1172    "kernel",
1173    (
1174        linear_kernel,
1175        polynomial_kernel,
1176        rbf_kernel,
1177        laplacian_kernel,
1178        sigmoid_kernel,
1179        cosine_similarity,
1180    ),
1181)
1182def test_kernel_sparse(kernel):
1183    rng = np.random.RandomState(0)
1184    X = rng.random_sample((5, 4))
1185    X_sparse = csr_matrix(X)
1186    K = kernel(X, X)
1187    K2 = kernel(X_sparse, X_sparse)
1188    assert_array_almost_equal(K, K2)
1189
1190
1191def test_linear_kernel():
1192    rng = np.random.RandomState(0)
1193    X = rng.random_sample((5, 4))
1194    K = linear_kernel(X, X)
1195    # the diagonal elements of a linear kernel are their squared norm
1196    assert_array_almost_equal(K.flat[::6], [linalg.norm(x) ** 2 for x in X])
1197
1198
1199def test_rbf_kernel():
1200    rng = np.random.RandomState(0)
1201    X = rng.random_sample((5, 4))
1202    K = rbf_kernel(X, X)
1203    # the diagonal elements of a rbf kernel are 1
1204    assert_array_almost_equal(K.flat[::6], np.ones(5))
1205
1206
1207def test_laplacian_kernel():
1208    rng = np.random.RandomState(0)
1209    X = rng.random_sample((5, 4))
1210    K = laplacian_kernel(X, X)
1211    # the diagonal elements of a laplacian kernel are 1
1212    assert_array_almost_equal(np.diag(K), np.ones(5))
1213
1214    # off-diagonal elements are < 1 but > 0:
1215    assert np.all(K > 0)
1216    assert np.all(K - np.diag(np.diag(K)) < 1)
1217
1218
1219@pytest.mark.parametrize(
1220    "metric, pairwise_func", [("linear", linear_kernel), ("cosine", cosine_similarity)]
1221)
1222def test_pairwise_similarity_sparse_output(metric, pairwise_func):
1223    rng = np.random.RandomState(0)
1224    X = rng.random_sample((5, 4))
1225    Y = rng.random_sample((3, 4))
1226    Xcsr = csr_matrix(X)
1227    Ycsr = csr_matrix(Y)
1228
1229    # should be sparse
1230    K1 = pairwise_func(Xcsr, Ycsr, dense_output=False)
1231    assert issparse(K1)
1232
1233    # should be dense, and equal to K1
1234    K2 = pairwise_func(X, Y, dense_output=True)
1235    assert not issparse(K2)
1236    assert_array_almost_equal(K1.todense(), K2)
1237
1238    # show the kernel output equal to the sparse.todense()
1239    K3 = pairwise_kernels(X, Y=Y, metric=metric)
1240    assert_array_almost_equal(K1.todense(), K3)
1241
1242
1243def test_cosine_similarity():
1244    # Test the cosine_similarity.
1245
1246    rng = np.random.RandomState(0)
1247    X = rng.random_sample((5, 4))
1248    Y = rng.random_sample((3, 4))
1249    Xcsr = csr_matrix(X)
1250    Ycsr = csr_matrix(Y)
1251
1252    for X_, Y_ in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
1253        # Test that the cosine is kernel is equal to a linear kernel when data
1254        # has been previously normalized by L2-norm.
1255        K1 = pairwise_kernels(X_, Y=Y_, metric="cosine")
1256        X_ = normalize(X_)
1257        if Y_ is not None:
1258            Y_ = normalize(Y_)
1259        K2 = pairwise_kernels(X_, Y=Y_, metric="linear")
1260        assert_array_almost_equal(K1, K2)
1261
1262
1263def test_check_dense_matrices():
1264    # Ensure that pairwise array check works for dense matrices.
1265    # Check that if XB is None, XB is returned as reference to XA
1266    XA = np.resize(np.arange(40), (5, 8))
1267    XA_checked, XB_checked = check_pairwise_arrays(XA, None)
1268    assert XA_checked is XB_checked
1269    assert_array_equal(XA, XA_checked)
1270
1271
1272def test_check_XB_returned():
1273    # Ensure that if XA and XB are given correctly, they return as equal.
1274    # Check that if XB is not None, it is returned equal.
1275    # Note that the second dimension of XB is the same as XA.
1276    XA = np.resize(np.arange(40), (5, 8))
1277    XB = np.resize(np.arange(32), (4, 8))
1278    XA_checked, XB_checked = check_pairwise_arrays(XA, XB)
1279    assert_array_equal(XA, XA_checked)
1280    assert_array_equal(XB, XB_checked)
1281
1282    XB = np.resize(np.arange(40), (5, 8))
1283    XA_checked, XB_checked = check_paired_arrays(XA, XB)
1284    assert_array_equal(XA, XA_checked)
1285    assert_array_equal(XB, XB_checked)
1286
1287
1288def test_check_different_dimensions():
1289    # Ensure an error is raised if the dimensions are different.
1290    XA = np.resize(np.arange(45), (5, 9))
1291    XB = np.resize(np.arange(32), (4, 8))
1292    with pytest.raises(ValueError):
1293        check_pairwise_arrays(XA, XB)
1294
1295    XB = np.resize(np.arange(4 * 9), (4, 9))
1296    with pytest.raises(ValueError):
1297        check_paired_arrays(XA, XB)
1298
1299
1300def test_check_invalid_dimensions():
1301    # Ensure an error is raised on 1D input arrays.
1302    # The modified tests are not 1D. In the old test, the array was internally
1303    # converted to 2D anyways
1304    XA = np.arange(45).reshape(9, 5)
1305    XB = np.arange(32).reshape(4, 8)
1306    with pytest.raises(ValueError):
1307        check_pairwise_arrays(XA, XB)
1308    XA = np.arange(45).reshape(9, 5)
1309    XB = np.arange(32).reshape(4, 8)
1310    with pytest.raises(ValueError):
1311        check_pairwise_arrays(XA, XB)
1312
1313
1314def test_check_sparse_arrays():
1315    # Ensures that checks return valid sparse matrices.
1316    rng = np.random.RandomState(0)
1317    XA = rng.random_sample((5, 4))
1318    XA_sparse = csr_matrix(XA)
1319    XB = rng.random_sample((5, 4))
1320    XB_sparse = csr_matrix(XB)
1321    XA_checked, XB_checked = check_pairwise_arrays(XA_sparse, XB_sparse)
1322    # compare their difference because testing csr matrices for
1323    # equality with '==' does not work as expected.
1324    assert issparse(XA_checked)
1325    assert abs(XA_sparse - XA_checked).sum() == 0
1326    assert issparse(XB_checked)
1327    assert abs(XB_sparse - XB_checked).sum() == 0
1328
1329    XA_checked, XA_2_checked = check_pairwise_arrays(XA_sparse, XA_sparse)
1330    assert issparse(XA_checked)
1331    assert abs(XA_sparse - XA_checked).sum() == 0
1332    assert issparse(XA_2_checked)
1333    assert abs(XA_2_checked - XA_checked).sum() == 0
1334
1335
1336def tuplify(X):
1337    # Turns a numpy matrix (any n-dimensional array) into tuples.
1338    s = X.shape
1339    if len(s) > 1:
1340        # Tuplify each sub-array in the input.
1341        return tuple(tuplify(row) for row in X)
1342    else:
1343        # Single dimension input, just return tuple of contents.
1344        return tuple(r for r in X)
1345
1346
1347def test_check_tuple_input():
1348    # Ensures that checks return valid tuples.
1349    rng = np.random.RandomState(0)
1350    XA = rng.random_sample((5, 4))
1351    XA_tuples = tuplify(XA)
1352    XB = rng.random_sample((5, 4))
1353    XB_tuples = tuplify(XB)
1354    XA_checked, XB_checked = check_pairwise_arrays(XA_tuples, XB_tuples)
1355    assert_array_equal(XA_tuples, XA_checked)
1356    assert_array_equal(XB_tuples, XB_checked)
1357
1358
1359def test_check_preserve_type():
1360    # Ensures that type float32 is preserved.
1361    XA = np.resize(np.arange(40), (5, 8)).astype(np.float32)
1362    XB = np.resize(np.arange(40), (5, 8)).astype(np.float32)
1363
1364    XA_checked, XB_checked = check_pairwise_arrays(XA, None)
1365    assert XA_checked.dtype == np.float32
1366
1367    # both float32
1368    XA_checked, XB_checked = check_pairwise_arrays(XA, XB)
1369    assert XA_checked.dtype == np.float32
1370    assert XB_checked.dtype == np.float32
1371
1372    # mismatched A
1373    XA_checked, XB_checked = check_pairwise_arrays(XA.astype(float), XB)
1374    assert XA_checked.dtype == float
1375    assert XB_checked.dtype == float
1376
1377    # mismatched B
1378    XA_checked, XB_checked = check_pairwise_arrays(XA, XB.astype(float))
1379    assert XA_checked.dtype == float
1380    assert XB_checked.dtype == float
1381
1382
1383@pytest.mark.parametrize("n_jobs", [1, 2])
1384@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"])
1385@pytest.mark.parametrize(
1386    "dist_function", [pairwise_distances, pairwise_distances_chunked]
1387)
1388def test_pairwise_distances_data_derived_params(n_jobs, metric, dist_function):
1389    # check that pairwise_distances give the same result in sequential and
1390    # parallel, when metric has data-derived parameters.
1391    with config_context(working_memory=0.1):  # to have more than 1 chunk
1392        rng = np.random.RandomState(0)
1393        X = rng.random_sample((100, 10))
1394
1395        expected_dist = squareform(pdist(X, metric=metric))
1396        dist = np.vstack(tuple(dist_function(X, metric=metric, n_jobs=n_jobs)))
1397
1398        assert_allclose(dist, expected_dist)
1399
1400
1401@pytest.mark.parametrize("metric", ["seuclidean", "mahalanobis"])
1402def test_pairwise_distances_data_derived_params_error(metric):
1403    # check that pairwise_distances raises an error when Y is passed but
1404    # metric has data-derived params that are not provided by the user.
1405    rng = np.random.RandomState(0)
1406    X = rng.random_sample((100, 10))
1407    Y = rng.random_sample((100, 10))
1408
1409    with pytest.raises(
1410        ValueError,
1411        match=fr"The '(V|VI)' parameter is required for the " fr"{metric} metric",
1412    ):
1413        pairwise_distances(X, Y, metric=metric)
1414
1415
1416@pytest.mark.parametrize(
1417    "metric",
1418    [
1419        "braycurtis",
1420        "canberra",
1421        "chebyshev",
1422        "correlation",
1423        "hamming",
1424        "mahalanobis",
1425        "minkowski",
1426        "seuclidean",
1427        "sqeuclidean",
1428        "cityblock",
1429        "cosine",
1430        "euclidean",
1431    ],
1432)
1433@pytest.mark.parametrize("dtype", [np.float32, np.float64])
1434@pytest.mark.parametrize("y_is_x", [True, False], ids=["Y is X", "Y is not X"])
1435def test_numeric_pairwise_distances_datatypes(metric, dtype, y_is_x):
1436    # Check that pairwise distances gives the same result as pdist and cdist
1437    # regardless of input datatype when using any scipy metric for comparing
1438    # numeric vectors
1439    #
1440    # This test is necessary because pairwise_distances used to throw an
1441    # error when using metric='seuclidean' and the input data was not
1442    # of type np.float64 (#15730)
1443
1444    rng = np.random.RandomState(0)
1445
1446    X = rng.random_sample((5, 4)).astype(dtype)
1447
1448    params = {}
1449    if y_is_x:
1450        Y = X
1451        expected_dist = squareform(pdist(X, metric=metric))
1452    else:
1453        Y = rng.random_sample((5, 4)).astype(dtype)
1454        expected_dist = cdist(X, Y, metric=metric)
1455        # precompute parameters for seuclidean & mahalanobis when x is not y
1456        if metric == "seuclidean":
1457            params = {"V": np.var(np.vstack([X, Y]), axis=0, ddof=1, dtype=np.float64)}
1458        elif metric == "mahalanobis":
1459            params = {"VI": np.linalg.inv(np.cov(np.vstack([X, Y]).T)).T}
1460
1461    dist = pairwise_distances(X, Y, metric=metric, **params)
1462
1463    # the default rtol=1e-7 is too close to the float32 precision
1464    # and fails due to rounding errors
1465    rtol = 1e-5 if dtype is np.float32 else 1e-7
1466    assert_allclose(dist, expected_dist, rtol=rtol)
1467