1"""
2Several basic tests for hierarchical clustering procedures
3
4"""
5# Authors: Vincent Michel, 2010, Gael Varoquaux 2012,
6#          Matteo Visconti di Oleggio Castello 2014
7# License: BSD 3 clause
8import itertools
9from tempfile import mkdtemp
10import shutil
11import pytest
12from functools import partial
13
14import numpy as np
15from scipy import sparse
16from scipy.cluster import hierarchy
17from scipy.sparse.csgraph import connected_components
18
19from sklearn.metrics.cluster import adjusted_rand_score
20from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS
21from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data
22from sklearn.utils._testing import assert_array_almost_equal
23from sklearn.utils._testing import ignore_warnings
24
25from sklearn.cluster import ward_tree
26from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
27from sklearn.cluster._agglomerative import (
28    _hc_cut,
29    _TREE_BUILDERS,
30    linkage_tree,
31    _fix_connectivity,
32)
33from sklearn.feature_extraction.image import grid_to_graph
34from sklearn.metrics import DistanceMetric
35from sklearn.metrics.pairwise import (
36    PAIRED_DISTANCES,
37    cosine_distances,
38    manhattan_distances,
39    pairwise_distances,
40)
41from sklearn.metrics.cluster import normalized_mutual_info_score
42from sklearn.neighbors import kneighbors_graph
43from sklearn.cluster._hierarchical_fast import (
44    average_merge,
45    max_merge,
46    mst_linkage_core,
47)
48from sklearn.utils._fast_dict import IntFloatDict
49from sklearn.utils._testing import assert_array_equal
50from sklearn.datasets import make_moons, make_circles
51
52
53def test_linkage_misc():
54    # Misc tests on linkage
55    rng = np.random.RandomState(42)
56    X = rng.normal(size=(5, 5))
57    with pytest.raises(ValueError):
58        AgglomerativeClustering(linkage="foo").fit(X)
59
60    with pytest.raises(ValueError):
61        linkage_tree(X, linkage="foo")
62
63    with pytest.raises(ValueError):
64        linkage_tree(X, connectivity=np.ones((4, 4)))
65
66    # Smoke test FeatureAgglomeration
67    FeatureAgglomeration().fit(X)
68
69    # test hierarchical clustering on a precomputed distances matrix
70    dis = cosine_distances(X)
71
72    res = linkage_tree(dis, affinity="precomputed")
73    assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0])
74
75    # test hierarchical clustering on a precomputed distances matrix
76    res = linkage_tree(X, affinity=manhattan_distances)
77    assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
78
79
80def test_structured_linkage_tree():
81    # Check that we obtain the correct solution for structured linkage trees.
82    rng = np.random.RandomState(0)
83    mask = np.ones([10, 10], dtype=bool)
84    # Avoiding a mask with only 'True' entries
85    mask[4:7, 4:7] = 0
86    X = rng.randn(50, 100)
87    connectivity = grid_to_graph(*mask.shape)
88    for tree_builder in _TREE_BUILDERS.values():
89        children, n_components, n_leaves, parent = tree_builder(
90            X.T, connectivity=connectivity
91        )
92        n_nodes = 2 * X.shape[1] - 1
93        assert len(children) + n_leaves == n_nodes
94        # Check that ward_tree raises a ValueError with a connectivity matrix
95        # of the wrong shape
96        with pytest.raises(ValueError):
97            tree_builder(X.T, connectivity=np.ones((4, 4)))
98        # Check that fitting with no samples raises an error
99        with pytest.raises(ValueError):
100            tree_builder(X.T[:0], connectivity=connectivity)
101
102
103def test_unstructured_linkage_tree():
104    # Check that we obtain the correct solution for unstructured linkage trees.
105    rng = np.random.RandomState(0)
106    X = rng.randn(50, 100)
107    for this_X in (X, X[0]):
108        # With specified a number of clusters just for the sake of
109        # raising a warning and testing the warning code
110        with ignore_warnings():
111            with pytest.warns(UserWarning):
112                children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10)
113        n_nodes = 2 * X.shape[1] - 1
114        assert len(children) + n_leaves == n_nodes
115
116    for tree_builder in _TREE_BUILDERS.values():
117        for this_X in (X, X[0]):
118            with ignore_warnings():
119                with pytest.warns(UserWarning):
120                    children, n_nodes, n_leaves, parent = tree_builder(
121                        this_X.T, n_clusters=10
122                    )
123            n_nodes = 2 * X.shape[1] - 1
124            assert len(children) + n_leaves == n_nodes
125
126
127def test_height_linkage_tree():
128    # Check that the height of the results of linkage tree is sorted.
129    rng = np.random.RandomState(0)
130    mask = np.ones([10, 10], dtype=bool)
131    X = rng.randn(50, 100)
132    connectivity = grid_to_graph(*mask.shape)
133    for linkage_func in _TREE_BUILDERS.values():
134        children, n_nodes, n_leaves, parent = linkage_func(
135            X.T, connectivity=connectivity
136        )
137        n_nodes = 2 * X.shape[1] - 1
138        assert len(children) + n_leaves == n_nodes
139
140
141def test_agglomerative_clustering_wrong_arg_memory():
142    # Test either if an error is raised when memory is not
143    # either a str or a joblib.Memory instance
144    rng = np.random.RandomState(0)
145    n_samples = 100
146    X = rng.randn(n_samples, 50)
147    memory = 5
148    clustering = AgglomerativeClustering(memory=memory)
149    with pytest.raises(ValueError):
150        clustering.fit(X)
151
152
153def test_zero_cosine_linkage_tree():
154    # Check that zero vectors in X produce an error when
155    # 'cosine' affinity is used
156    X = np.array([[0, 1], [0, 0]])
157    msg = "Cosine affinity cannot be used when X contains zero vectors"
158    with pytest.raises(ValueError, match=msg):
159        linkage_tree(X, affinity="cosine")
160
161
162@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)])
163@pytest.mark.parametrize("compute_distances", [True, False])
164@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"])
165def test_agglomerative_clustering_distances(
166    n_clusters, compute_distances, distance_threshold, linkage
167):
168    # Check that when `compute_distances` is True or `distance_threshold` is
169    # given, the fitted model has an attribute `distances_`.
170    rng = np.random.RandomState(0)
171    mask = np.ones([10, 10], dtype=bool)
172    n_samples = 100
173    X = rng.randn(n_samples, 50)
174    connectivity = grid_to_graph(*mask.shape)
175
176    clustering = AgglomerativeClustering(
177        n_clusters=n_clusters,
178        connectivity=connectivity,
179        linkage=linkage,
180        distance_threshold=distance_threshold,
181        compute_distances=compute_distances,
182    )
183    clustering.fit(X)
184    if compute_distances or (distance_threshold is not None):
185        assert hasattr(clustering, "distances_")
186        n_children = clustering.children_.shape[0]
187        n_nodes = n_children + 1
188        assert clustering.distances_.shape == (n_nodes - 1,)
189    else:
190        assert not hasattr(clustering, "distances_")
191
192
193def test_agglomerative_clustering():
194    # Check that we obtain the correct number of clusters with
195    # agglomerative clustering.
196    rng = np.random.RandomState(0)
197    mask = np.ones([10, 10], dtype=bool)
198    n_samples = 100
199    X = rng.randn(n_samples, 50)
200    connectivity = grid_to_graph(*mask.shape)
201    for linkage in ("ward", "complete", "average", "single"):
202        clustering = AgglomerativeClustering(
203            n_clusters=10, connectivity=connectivity, linkage=linkage
204        )
205        clustering.fit(X)
206        # test caching
207        try:
208            tempdir = mkdtemp()
209            clustering = AgglomerativeClustering(
210                n_clusters=10,
211                connectivity=connectivity,
212                memory=tempdir,
213                linkage=linkage,
214            )
215            clustering.fit(X)
216            labels = clustering.labels_
217            assert np.size(np.unique(labels)) == 10
218        finally:
219            shutil.rmtree(tempdir)
220        # Turn caching off now
221        clustering = AgglomerativeClustering(
222            n_clusters=10, connectivity=connectivity, linkage=linkage
223        )
224        # Check that we obtain the same solution with early-stopping of the
225        # tree building
226        clustering.compute_full_tree = False
227        clustering.fit(X)
228        assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1)
229        clustering.connectivity = None
230        clustering.fit(X)
231        assert np.size(np.unique(clustering.labels_)) == 10
232        # Check that we raise a TypeError on dense matrices
233        clustering = AgglomerativeClustering(
234            n_clusters=10,
235            connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]),
236            linkage=linkage,
237        )
238        with pytest.raises(ValueError):
239            clustering.fit(X)
240
241    # Test that using ward with another metric than euclidean raises an
242    # exception
243    clustering = AgglomerativeClustering(
244        n_clusters=10,
245        connectivity=connectivity.toarray(),
246        affinity="manhattan",
247        linkage="ward",
248    )
249    with pytest.raises(ValueError):
250        clustering.fit(X)
251
252    # Test using another metric than euclidean works with linkage complete
253    for affinity in PAIRED_DISTANCES.keys():
254        # Compare our (structured) implementation to scipy
255        clustering = AgglomerativeClustering(
256            n_clusters=10,
257            connectivity=np.ones((n_samples, n_samples)),
258            affinity=affinity,
259            linkage="complete",
260        )
261        clustering.fit(X)
262        clustering2 = AgglomerativeClustering(
263            n_clusters=10, connectivity=None, affinity=affinity, linkage="complete"
264        )
265        clustering2.fit(X)
266        assert_almost_equal(
267            normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1
268        )
269
270    # Test that using a distance matrix (affinity = 'precomputed') has same
271    # results (with connectivity constraints)
272    clustering = AgglomerativeClustering(
273        n_clusters=10, connectivity=connectivity, linkage="complete"
274    )
275    clustering.fit(X)
276    X_dist = pairwise_distances(X)
277    clustering2 = AgglomerativeClustering(
278        n_clusters=10,
279        connectivity=connectivity,
280        affinity="precomputed",
281        linkage="complete",
282    )
283    clustering2.fit(X_dist)
284    assert_array_equal(clustering.labels_, clustering2.labels_)
285
286
287def test_agglomerative_clustering_memory_mapped():
288    """AgglomerativeClustering must work on mem-mapped dataset.
289
290    Non-regression test for issue #19875.
291    """
292    rng = np.random.RandomState(0)
293    Xmm = create_memmap_backed_data(rng.randn(50, 100))
294    AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm)
295
296
297def test_ward_agglomeration():
298    # Check that we obtain the correct solution in a simplistic case
299    rng = np.random.RandomState(0)
300    mask = np.ones([10, 10], dtype=bool)
301    X = rng.randn(50, 100)
302    connectivity = grid_to_graph(*mask.shape)
303    agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity)
304    agglo.fit(X)
305    assert np.size(np.unique(agglo.labels_)) == 5
306
307    X_red = agglo.transform(X)
308    assert X_red.shape[1] == 5
309    X_full = agglo.inverse_transform(X_red)
310    assert np.unique(X_full[0]).size == 5
311    assert_array_almost_equal(agglo.transform(X_full), X_red)
312
313    # Check that fitting with no samples raises a ValueError
314    with pytest.raises(ValueError):
315        agglo.fit(X[:0])
316
317
318def test_single_linkage_clustering():
319    # Check that we get the correct result in two emblematic cases
320    moons, moon_labels = make_moons(noise=0.05, random_state=42)
321    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
322    clustering.fit(moons)
323    assert_almost_equal(
324        normalized_mutual_info_score(clustering.labels_, moon_labels), 1
325    )
326
327    circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42)
328    clustering = AgglomerativeClustering(n_clusters=2, linkage="single")
329    clustering.fit(circles)
330    assert_almost_equal(
331        normalized_mutual_info_score(clustering.labels_, circle_labels), 1
332    )
333
334
335def assess_same_labelling(cut1, cut2):
336    """Util for comparison with scipy"""
337    co_clust = []
338    for cut in [cut1, cut2]:
339        n = len(cut)
340        k = cut.max() + 1
341        ecut = np.zeros((n, k))
342        ecut[np.arange(n), cut] = 1
343        co_clust.append(np.dot(ecut, ecut.T))
344    assert (co_clust[0] == co_clust[1]).all()
345
346
347def test_sparse_scikit_vs_scipy():
348    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
349    n, p, k = 10, 5, 3
350    rng = np.random.RandomState(0)
351
352    # Not using a lil_matrix here, just to check that non sparse
353    # matrices are well handled
354    connectivity = np.ones((n, n))
355    for linkage in _TREE_BUILDERS.keys():
356        for i in range(5):
357            X = 0.1 * rng.normal(size=(n, p))
358            X -= 4.0 * np.arange(n)[:, np.newaxis]
359            X -= X.mean(axis=1)[:, np.newaxis]
360
361            out = hierarchy.linkage(X, method=linkage)
362
363            children_ = out[:, :2].astype(int, copy=False)
364            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](
365                X, connectivity=connectivity
366            )
367
368            # Sort the order of child nodes per row for consistency
369            children.sort(axis=1)
370            assert_array_equal(
371                children,
372                children_,
373                "linkage tree differs from scipy impl for linkage: " + linkage,
374            )
375
376            cut = _hc_cut(k, children, n_leaves)
377            cut_ = _hc_cut(k, children_, n_leaves)
378            assess_same_labelling(cut, cut_)
379
380    # Test error management in _hc_cut
381    with pytest.raises(ValueError):
382        _hc_cut(n_leaves + 1, children, n_leaves)
383
384
385# Make sure our custom mst_linkage_core gives
386# the same results as scipy's builtin
387@pytest.mark.parametrize("seed", range(5))
388def test_vector_scikit_single_vs_scipy_single(seed):
389    n_samples, n_features, n_clusters = 10, 5, 3
390    rng = np.random.RandomState(seed)
391    X = 0.1 * rng.normal(size=(n_samples, n_features))
392    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
393    X -= X.mean(axis=1)[:, np.newaxis]
394
395    out = hierarchy.linkage(X, method="single")
396    children_scipy = out[:, :2].astype(int)
397
398    children, _, n_leaves, _ = _TREE_BUILDERS["single"](X)
399
400    # Sort the order of child nodes per row for consistency
401    children.sort(axis=1)
402    assert_array_equal(
403        children,
404        children_scipy,
405        "linkage tree differs from scipy impl for single linkage.",
406    )
407
408    cut = _hc_cut(n_clusters, children, n_leaves)
409    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
410    assess_same_labelling(cut, cut_scipy)
411
412
413@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS)
414def test_mst_linkage_core_memory_mapped(metric_param_grid):
415    """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset.
416
417    Non-regression test for issue #19875.
418    """
419    rng = np.random.RandomState(seed=1)
420    X = rng.normal(size=(20, 4))
421    Xmm = create_memmap_backed_data(X)
422    metric, param_grid = metric_param_grid
423    keys = param_grid.keys()
424    for vals in itertools.product(*param_grid.values()):
425        kwargs = dict(zip(keys, vals))
426        distance_metric = DistanceMetric.get_metric(metric, **kwargs)
427        mst = mst_linkage_core(X, distance_metric)
428        mst_mm = mst_linkage_core(Xmm, distance_metric)
429        np.testing.assert_equal(mst, mst_mm)
430
431
432def test_identical_points():
433    # Ensure identical points are handled correctly when using mst with
434    # a sparse connectivity matrix
435    X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]])
436    true_labels = np.array([0, 0, 1, 1, 2, 2])
437    connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False)
438    connectivity = 0.5 * (connectivity + connectivity.T)
439    connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean")
440
441    for linkage in ("single", "average", "average", "ward"):
442        clustering = AgglomerativeClustering(
443            n_clusters=3, linkage=linkage, connectivity=connectivity
444        )
445        clustering.fit(X)
446
447        assert_almost_equal(
448            normalized_mutual_info_score(clustering.labels_, true_labels), 1
449        )
450
451
452def test_connectivity_propagation():
453    # Check that connectivity in the ward tree is propagated correctly during
454    # merging.
455    X = np.array(
456        [
457            (0.014, 0.120),
458            (0.014, 0.099),
459            (0.014, 0.097),
460            (0.017, 0.153),
461            (0.017, 0.153),
462            (0.018, 0.153),
463            (0.018, 0.153),
464            (0.018, 0.153),
465            (0.018, 0.153),
466            (0.018, 0.153),
467            (0.018, 0.153),
468            (0.018, 0.153),
469            (0.018, 0.152),
470            (0.018, 0.149),
471            (0.018, 0.144),
472        ]
473    )
474    connectivity = kneighbors_graph(X, 10, include_self=False)
475    ward = AgglomerativeClustering(
476        n_clusters=4, connectivity=connectivity, linkage="ward"
477    )
478    # If changes are not propagated correctly, fit crashes with an
479    # IndexError
480    ward.fit(X)
481
482
483def test_ward_tree_children_order():
484    # Check that children are ordered in the same way for both structured and
485    # unstructured versions of ward_tree.
486
487    # test on five random datasets
488    n, p = 10, 5
489    rng = np.random.RandomState(0)
490
491    connectivity = np.ones((n, n))
492    for i in range(5):
493        X = 0.1 * rng.normal(size=(n, p))
494        X -= 4.0 * np.arange(n)[:, np.newaxis]
495        X -= X.mean(axis=1)[:, np.newaxis]
496
497        out_unstructured = ward_tree(X)
498        out_structured = ward_tree(X, connectivity=connectivity)
499
500        assert_array_equal(out_unstructured[0], out_structured[0])
501
502
503def test_ward_linkage_tree_return_distance():
504    # Test return_distance option on linkage and ward trees
505
506    # test that return_distance when set true, gives same
507    # output on both structured and unstructured clustering.
508    n, p = 10, 5
509    rng = np.random.RandomState(0)
510
511    connectivity = np.ones((n, n))
512    for i in range(5):
513        X = 0.1 * rng.normal(size=(n, p))
514        X -= 4.0 * np.arange(n)[:, np.newaxis]
515        X -= X.mean(axis=1)[:, np.newaxis]
516
517        out_unstructured = ward_tree(X, return_distance=True)
518        out_structured = ward_tree(X, connectivity=connectivity, return_distance=True)
519
520        # get children
521        children_unstructured = out_unstructured[0]
522        children_structured = out_structured[0]
523
524        # check if we got the same clusters
525        assert_array_equal(children_unstructured, children_structured)
526
527        # check if the distances are the same
528        dist_unstructured = out_unstructured[-1]
529        dist_structured = out_structured[-1]
530
531        assert_array_almost_equal(dist_unstructured, dist_structured)
532
533        for linkage in ["average", "complete", "single"]:
534            structured_items = linkage_tree(
535                X, connectivity=connectivity, linkage=linkage, return_distance=True
536            )[-1]
537            unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[
538                -1
539            ]
540            structured_dist = structured_items[-1]
541            unstructured_dist = unstructured_items[-1]
542            structured_children = structured_items[0]
543            unstructured_children = unstructured_items[0]
544            assert_array_almost_equal(structured_dist, unstructured_dist)
545            assert_array_almost_equal(structured_children, unstructured_children)
546
547    # test on the following dataset where we know the truth
548    # taken from scipy/cluster/tests/hierarchy_test_data.py
549    X = np.array(
550        [
551            [1.43054825, -7.5693489],
552            [6.95887839, 6.82293382],
553            [2.87137846, -9.68248579],
554            [7.87974764, -6.05485803],
555            [8.24018364, -6.09495602],
556            [7.39020262, 8.54004355],
557        ]
558    )
559    # truth
560    linkage_X_ward = np.array(
561        [
562            [3.0, 4.0, 0.36265956, 2.0],
563            [1.0, 5.0, 1.77045373, 2.0],
564            [0.0, 2.0, 2.55760419, 2.0],
565            [6.0, 8.0, 9.10208346, 4.0],
566            [7.0, 9.0, 24.7784379, 6.0],
567        ]
568    )
569
570    linkage_X_complete = np.array(
571        [
572            [3.0, 4.0, 0.36265956, 2.0],
573            [1.0, 5.0, 1.77045373, 2.0],
574            [0.0, 2.0, 2.55760419, 2.0],
575            [6.0, 8.0, 6.96742194, 4.0],
576            [7.0, 9.0, 18.77445997, 6.0],
577        ]
578    )
579
580    linkage_X_average = np.array(
581        [
582            [3.0, 4.0, 0.36265956, 2.0],
583            [1.0, 5.0, 1.77045373, 2.0],
584            [0.0, 2.0, 2.55760419, 2.0],
585            [6.0, 8.0, 6.55832839, 4.0],
586            [7.0, 9.0, 15.44089605, 6.0],
587        ]
588    )
589
590    n_samples, n_features = np.shape(X)
591    connectivity_X = np.ones((n_samples, n_samples))
592
593    out_X_unstructured = ward_tree(X, return_distance=True)
594    out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True)
595
596    # check that the labels are the same
597    assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0])
598    assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0])
599
600    # check that the distances are correct
601    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4])
602    assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4])
603
604    linkage_options = ["complete", "average", "single"]
605    X_linkage_truth = [linkage_X_complete, linkage_X_average]
606    for (linkage, X_truth) in zip(linkage_options, X_linkage_truth):
607        out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage)
608        out_X_structured = linkage_tree(
609            X, connectivity=connectivity_X, linkage=linkage, return_distance=True
610        )
611
612        # check that the labels are the same
613        assert_array_equal(X_truth[:, :2], out_X_unstructured[0])
614        assert_array_equal(X_truth[:, :2], out_X_structured[0])
615
616        # check that the distances are correct
617        assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4])
618        assert_array_almost_equal(X_truth[:, 2], out_X_structured[4])
619
620
621def test_connectivity_fixing_non_lil():
622    # Check non regression of a bug if a non item assignable connectivity is
623    # provided with more than one component.
624    # create dummy data
625    x = np.array([[0, 0], [1, 1]])
626    # create a mask with several components to force connectivity fixing
627    m = np.array([[True, False], [False, True]])
628    c = grid_to_graph(n_x=2, n_y=2, mask=m)
629    w = AgglomerativeClustering(connectivity=c, linkage="ward")
630    with pytest.warns(UserWarning):
631        w.fit(x)
632
633
634def test_int_float_dict():
635    rng = np.random.RandomState(0)
636    keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False))
637    values = rng.rand(len(keys))
638
639    d = IntFloatDict(keys, values)
640    for key, value in zip(keys, values):
641        assert d[key] == value
642
643    other_keys = np.arange(50, dtype=np.intp)[::2]
644    other_values = np.full(50, 0.5)[::2]
645    other = IntFloatDict(other_keys, other_values)
646    # Complete smoke test
647    max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
648    average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1)
649
650
651def test_connectivity_callable():
652    rng = np.random.RandomState(0)
653    X = rng.rand(20, 5)
654    connectivity = kneighbors_graph(X, 3, include_self=False)
655    aglc1 = AgglomerativeClustering(connectivity=connectivity)
656    aglc2 = AgglomerativeClustering(
657        connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)
658    )
659    aglc1.fit(X)
660    aglc2.fit(X)
661    assert_array_equal(aglc1.labels_, aglc2.labels_)
662
663
664def test_connectivity_ignores_diagonal():
665    rng = np.random.RandomState(0)
666    X = rng.rand(20, 5)
667    connectivity = kneighbors_graph(X, 3, include_self=False)
668    connectivity_include_self = kneighbors_graph(X, 3, include_self=True)
669    aglc1 = AgglomerativeClustering(connectivity=connectivity)
670    aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self)
671    aglc1.fit(X)
672    aglc2.fit(X)
673    assert_array_equal(aglc1.labels_, aglc2.labels_)
674
675
676def test_compute_full_tree():
677    # Test that the full tree is computed if n_clusters is small
678    rng = np.random.RandomState(0)
679    X = rng.randn(10, 2)
680    connectivity = kneighbors_graph(X, 5, include_self=False)
681
682    # When n_clusters is less, the full tree should be built
683    # that is the number of merges should be n_samples - 1
684    agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity)
685    agc.fit(X)
686    n_samples = X.shape[0]
687    n_nodes = agc.children_.shape[0]
688    assert n_nodes == n_samples - 1
689
690    # When n_clusters is large, greater than max of 100 and 0.02 * n_samples.
691    # we should stop when there are n_clusters.
692    n_clusters = 101
693    X = rng.randn(200, 2)
694    connectivity = kneighbors_graph(X, 10, include_self=False)
695    agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity)
696    agc.fit(X)
697    n_samples = X.shape[0]
698    n_nodes = agc.children_.shape[0]
699    assert n_nodes == n_samples - n_clusters
700
701
702def test_n_components():
703    # Test n_components returned by linkage, average and ward tree
704    rng = np.random.RandomState(0)
705    X = rng.rand(5, 5)
706
707    # Connectivity matrix having five components.
708    connectivity = np.eye(5)
709
710    for linkage_func in _TREE_BUILDERS.values():
711        assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5
712
713
714def test_agg_n_clusters():
715    # Test that an error is raised when n_clusters <= 0
716
717    rng = np.random.RandomState(0)
718    X = rng.rand(20, 10)
719    for n_clus in [-1, 0]:
720        agc = AgglomerativeClustering(n_clusters=n_clus)
721        msg = "n_clusters should be an integer greater than 0. %s was provided." % str(
722            agc.n_clusters
723        )
724        with pytest.raises(ValueError, match=msg):
725            agc.fit(X)
726
727
728def test_affinity_passed_to_fix_connectivity():
729    # Test that the affinity parameter is actually passed to the pairwise
730    # function
731
732    size = 2
733    rng = np.random.RandomState(0)
734    X = rng.randn(size, size)
735    mask = np.array([True, False, False, True])
736
737    connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray)
738
739    class FakeAffinity:
740        def __init__(self):
741            self.counter = 0
742
743        def increment(self, *args, **kwargs):
744            self.counter += 1
745            return self.counter
746
747    fa = FakeAffinity()
748
749    linkage_tree(X, connectivity=connectivity, affinity=fa.increment)
750
751    assert fa.counter == 3
752
753
754@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
755def test_agglomerative_clustering_with_distance_threshold(linkage):
756    # Check that we obtain the correct number of clusters with
757    # agglomerative clustering with distance_threshold.
758    rng = np.random.RandomState(0)
759    mask = np.ones([10, 10], dtype=bool)
760    n_samples = 100
761    X = rng.randn(n_samples, 50)
762    connectivity = grid_to_graph(*mask.shape)
763    # test when distance threshold is set to 10
764    distance_threshold = 10
765    for conn in [None, connectivity]:
766        clustering = AgglomerativeClustering(
767            n_clusters=None,
768            distance_threshold=distance_threshold,
769            connectivity=conn,
770            linkage=linkage,
771        )
772        clustering.fit(X)
773        clusters_produced = clustering.labels_
774        num_clusters_produced = len(np.unique(clustering.labels_))
775        # test if the clusters produced match the point in the linkage tree
776        # where the distance exceeds the threshold
777        tree_builder = _TREE_BUILDERS[linkage]
778        children, n_components, n_leaves, parent, distances = tree_builder(
779            X, connectivity=conn, n_clusters=None, return_distance=True
780        )
781        num_clusters_at_threshold = (
782            np.count_nonzero(distances >= distance_threshold) + 1
783        )
784        # test number of clusters produced
785        assert num_clusters_at_threshold == num_clusters_produced
786        # test clusters produced
787        clusters_at_threshold = _hc_cut(
788            n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves
789        )
790        assert np.array_equiv(clusters_produced, clusters_at_threshold)
791
792
793def test_small_distance_threshold():
794    rng = np.random.RandomState(0)
795    n_samples = 10
796    X = rng.randint(-300, 300, size=(n_samples, 3))
797    # this should result in all data in their own clusters, given that
798    # their pairwise distances are bigger than .1 (which may not be the case
799    # with a different random seed).
800    clustering = AgglomerativeClustering(
801        n_clusters=None, distance_threshold=1.0, linkage="single"
802    ).fit(X)
803    # check that the pairwise distances are indeed all larger than .1
804    all_distances = pairwise_distances(X, metric="minkowski", p=2)
805    np.fill_diagonal(all_distances, np.inf)
806    assert np.all(all_distances > 0.1)
807    assert clustering.n_clusters_ == n_samples
808
809
810def test_cluster_distances_with_distance_threshold():
811    rng = np.random.RandomState(0)
812    n_samples = 100
813    X = rng.randint(-10, 10, size=(n_samples, 3))
814    # check the distances within the clusters and with other clusters
815    distance_threshold = 4
816    clustering = AgglomerativeClustering(
817        n_clusters=None, distance_threshold=distance_threshold, linkage="single"
818    ).fit(X)
819    labels = clustering.labels_
820    D = pairwise_distances(X, metric="minkowski", p=2)
821    # to avoid taking the 0 diagonal in min()
822    np.fill_diagonal(D, np.inf)
823    for label in np.unique(labels):
824        in_cluster_mask = labels == label
825        max_in_cluster_distance = (
826            D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max()
827        )
828        min_out_cluster_distance = (
829            D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min()
830        )
831        # single data point clusters only have that inf diagonal here
832        if in_cluster_mask.sum() > 1:
833            assert max_in_cluster_distance < distance_threshold
834        assert min_out_cluster_distance >= distance_threshold
835
836
837@pytest.mark.parametrize("linkage", ["ward", "complete", "average"])
838@pytest.mark.parametrize(
839    ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])]
840)
841def test_agglomerative_clustering_with_distance_threshold_edge_case(
842    linkage, threshold, y_true
843):
844    # test boundary case of distance_threshold matching the distance
845    X = [[0], [1]]
846    clusterer = AgglomerativeClustering(
847        n_clusters=None, distance_threshold=threshold, linkage=linkage
848    )
849    y_pred = clusterer.fit_predict(X)
850    assert adjusted_rand_score(y_true, y_pred) == 1
851
852
853def test_dist_threshold_invalid_parameters():
854    X = [[0], [1]]
855    with pytest.raises(ValueError, match="Exactly one of "):
856        AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X)
857
858    with pytest.raises(ValueError, match="Exactly one of "):
859        AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X)
860
861    X = [[0], [1]]
862    with pytest.raises(ValueError, match="compute_full_tree must be True if"):
863        AgglomerativeClustering(
864            n_clusters=None, distance_threshold=1, compute_full_tree=False
865        ).fit(X)
866
867
868def test_invalid_shape_precomputed_dist_matrix():
869    # Check that an error is raised when affinity='precomputed'
870    # and a non square matrix is passed (PR #16257).
871    rng = np.random.RandomState(0)
872    X = rng.rand(5, 3)
873    with pytest.raises(
874        ValueError,
875        match=r"Distance matrix should be square, got matrix of shape \(5, 3\)",
876    ):
877        AgglomerativeClustering(affinity="precomputed", linkage="complete").fit(X)
878
879
880def test_precomputed_connectivity_affinity_with_2_connected_components():
881    """Check that connecting components works when connectivity and
882    affinity are both precomputed and the number of connected components is
883    greater than 1. Non-regression test for #16151.
884    """
885
886    connectivity_matrix = np.array(
887        [
888            [0, 1, 1, 0, 0],
889            [0, 0, 1, 0, 0],
890            [0, 0, 0, 0, 0],
891            [0, 0, 0, 0, 1],
892            [0, 0, 0, 0, 0],
893        ]
894    )
895    # ensure that connectivity_matrix has two connected components
896    assert connected_components(connectivity_matrix)[0] == 2
897
898    rng = np.random.RandomState(0)
899    X = rng.randn(5, 10)
900
901    X_dist = pairwise_distances(X)
902    clusterer_precomputed = AgglomerativeClustering(
903        affinity="precomputed", connectivity=connectivity_matrix, linkage="complete"
904    )
905    msg = "Completing it to avoid stopping the tree early"
906    with pytest.warns(UserWarning, match=msg):
907        clusterer_precomputed.fit(X_dist)
908
909    clusterer = AgglomerativeClustering(
910        connectivity=connectivity_matrix, linkage="complete"
911    )
912    with pytest.warns(UserWarning, match=msg):
913        clusterer.fit(X)
914
915    assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_)
916    assert_array_equal(clusterer.children_, clusterer_precomputed.children_)
917