1""" 2Several basic tests for hierarchical clustering procedures 3 4""" 5# Authors: Vincent Michel, 2010, Gael Varoquaux 2012, 6# Matteo Visconti di Oleggio Castello 2014 7# License: BSD 3 clause 8import itertools 9from tempfile import mkdtemp 10import shutil 11import pytest 12from functools import partial 13 14import numpy as np 15from scipy import sparse 16from scipy.cluster import hierarchy 17from scipy.sparse.csgraph import connected_components 18 19from sklearn.metrics.cluster import adjusted_rand_score 20from sklearn.metrics.tests.test_dist_metrics import METRICS_DEFAULT_PARAMS 21from sklearn.utils._testing import assert_almost_equal, create_memmap_backed_data 22from sklearn.utils._testing import assert_array_almost_equal 23from sklearn.utils._testing import ignore_warnings 24 25from sklearn.cluster import ward_tree 26from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration 27from sklearn.cluster._agglomerative import ( 28 _hc_cut, 29 _TREE_BUILDERS, 30 linkage_tree, 31 _fix_connectivity, 32) 33from sklearn.feature_extraction.image import grid_to_graph 34from sklearn.metrics import DistanceMetric 35from sklearn.metrics.pairwise import ( 36 PAIRED_DISTANCES, 37 cosine_distances, 38 manhattan_distances, 39 pairwise_distances, 40) 41from sklearn.metrics.cluster import normalized_mutual_info_score 42from sklearn.neighbors import kneighbors_graph 43from sklearn.cluster._hierarchical_fast import ( 44 average_merge, 45 max_merge, 46 mst_linkage_core, 47) 48from sklearn.utils._fast_dict import IntFloatDict 49from sklearn.utils._testing import assert_array_equal 50from sklearn.datasets import make_moons, make_circles 51 52 53def test_linkage_misc(): 54 # Misc tests on linkage 55 rng = np.random.RandomState(42) 56 X = rng.normal(size=(5, 5)) 57 with pytest.raises(ValueError): 58 AgglomerativeClustering(linkage="foo").fit(X) 59 60 with pytest.raises(ValueError): 61 linkage_tree(X, linkage="foo") 62 63 with pytest.raises(ValueError): 64 linkage_tree(X, connectivity=np.ones((4, 4))) 65 66 # Smoke test FeatureAgglomeration 67 FeatureAgglomeration().fit(X) 68 69 # test hierarchical clustering on a precomputed distances matrix 70 dis = cosine_distances(X) 71 72 res = linkage_tree(dis, affinity="precomputed") 73 assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0]) 74 75 # test hierarchical clustering on a precomputed distances matrix 76 res = linkage_tree(X, affinity=manhattan_distances) 77 assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0]) 78 79 80def test_structured_linkage_tree(): 81 # Check that we obtain the correct solution for structured linkage trees. 82 rng = np.random.RandomState(0) 83 mask = np.ones([10, 10], dtype=bool) 84 # Avoiding a mask with only 'True' entries 85 mask[4:7, 4:7] = 0 86 X = rng.randn(50, 100) 87 connectivity = grid_to_graph(*mask.shape) 88 for tree_builder in _TREE_BUILDERS.values(): 89 children, n_components, n_leaves, parent = tree_builder( 90 X.T, connectivity=connectivity 91 ) 92 n_nodes = 2 * X.shape[1] - 1 93 assert len(children) + n_leaves == n_nodes 94 # Check that ward_tree raises a ValueError with a connectivity matrix 95 # of the wrong shape 96 with pytest.raises(ValueError): 97 tree_builder(X.T, connectivity=np.ones((4, 4))) 98 # Check that fitting with no samples raises an error 99 with pytest.raises(ValueError): 100 tree_builder(X.T[:0], connectivity=connectivity) 101 102 103def test_unstructured_linkage_tree(): 104 # Check that we obtain the correct solution for unstructured linkage trees. 105 rng = np.random.RandomState(0) 106 X = rng.randn(50, 100) 107 for this_X in (X, X[0]): 108 # With specified a number of clusters just for the sake of 109 # raising a warning and testing the warning code 110 with ignore_warnings(): 111 with pytest.warns(UserWarning): 112 children, n_nodes, n_leaves, parent = ward_tree(this_X.T, n_clusters=10) 113 n_nodes = 2 * X.shape[1] - 1 114 assert len(children) + n_leaves == n_nodes 115 116 for tree_builder in _TREE_BUILDERS.values(): 117 for this_X in (X, X[0]): 118 with ignore_warnings(): 119 with pytest.warns(UserWarning): 120 children, n_nodes, n_leaves, parent = tree_builder( 121 this_X.T, n_clusters=10 122 ) 123 n_nodes = 2 * X.shape[1] - 1 124 assert len(children) + n_leaves == n_nodes 125 126 127def test_height_linkage_tree(): 128 # Check that the height of the results of linkage tree is sorted. 129 rng = np.random.RandomState(0) 130 mask = np.ones([10, 10], dtype=bool) 131 X = rng.randn(50, 100) 132 connectivity = grid_to_graph(*mask.shape) 133 for linkage_func in _TREE_BUILDERS.values(): 134 children, n_nodes, n_leaves, parent = linkage_func( 135 X.T, connectivity=connectivity 136 ) 137 n_nodes = 2 * X.shape[1] - 1 138 assert len(children) + n_leaves == n_nodes 139 140 141def test_agglomerative_clustering_wrong_arg_memory(): 142 # Test either if an error is raised when memory is not 143 # either a str or a joblib.Memory instance 144 rng = np.random.RandomState(0) 145 n_samples = 100 146 X = rng.randn(n_samples, 50) 147 memory = 5 148 clustering = AgglomerativeClustering(memory=memory) 149 with pytest.raises(ValueError): 150 clustering.fit(X) 151 152 153def test_zero_cosine_linkage_tree(): 154 # Check that zero vectors in X produce an error when 155 # 'cosine' affinity is used 156 X = np.array([[0, 1], [0, 0]]) 157 msg = "Cosine affinity cannot be used when X contains zero vectors" 158 with pytest.raises(ValueError, match=msg): 159 linkage_tree(X, affinity="cosine") 160 161 162@pytest.mark.parametrize("n_clusters, distance_threshold", [(None, 0.5), (10, None)]) 163@pytest.mark.parametrize("compute_distances", [True, False]) 164@pytest.mark.parametrize("linkage", ["ward", "complete", "average", "single"]) 165def test_agglomerative_clustering_distances( 166 n_clusters, compute_distances, distance_threshold, linkage 167): 168 # Check that when `compute_distances` is True or `distance_threshold` is 169 # given, the fitted model has an attribute `distances_`. 170 rng = np.random.RandomState(0) 171 mask = np.ones([10, 10], dtype=bool) 172 n_samples = 100 173 X = rng.randn(n_samples, 50) 174 connectivity = grid_to_graph(*mask.shape) 175 176 clustering = AgglomerativeClustering( 177 n_clusters=n_clusters, 178 connectivity=connectivity, 179 linkage=linkage, 180 distance_threshold=distance_threshold, 181 compute_distances=compute_distances, 182 ) 183 clustering.fit(X) 184 if compute_distances or (distance_threshold is not None): 185 assert hasattr(clustering, "distances_") 186 n_children = clustering.children_.shape[0] 187 n_nodes = n_children + 1 188 assert clustering.distances_.shape == (n_nodes - 1,) 189 else: 190 assert not hasattr(clustering, "distances_") 191 192 193def test_agglomerative_clustering(): 194 # Check that we obtain the correct number of clusters with 195 # agglomerative clustering. 196 rng = np.random.RandomState(0) 197 mask = np.ones([10, 10], dtype=bool) 198 n_samples = 100 199 X = rng.randn(n_samples, 50) 200 connectivity = grid_to_graph(*mask.shape) 201 for linkage in ("ward", "complete", "average", "single"): 202 clustering = AgglomerativeClustering( 203 n_clusters=10, connectivity=connectivity, linkage=linkage 204 ) 205 clustering.fit(X) 206 # test caching 207 try: 208 tempdir = mkdtemp() 209 clustering = AgglomerativeClustering( 210 n_clusters=10, 211 connectivity=connectivity, 212 memory=tempdir, 213 linkage=linkage, 214 ) 215 clustering.fit(X) 216 labels = clustering.labels_ 217 assert np.size(np.unique(labels)) == 10 218 finally: 219 shutil.rmtree(tempdir) 220 # Turn caching off now 221 clustering = AgglomerativeClustering( 222 n_clusters=10, connectivity=connectivity, linkage=linkage 223 ) 224 # Check that we obtain the same solution with early-stopping of the 225 # tree building 226 clustering.compute_full_tree = False 227 clustering.fit(X) 228 assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) 229 clustering.connectivity = None 230 clustering.fit(X) 231 assert np.size(np.unique(clustering.labels_)) == 10 232 # Check that we raise a TypeError on dense matrices 233 clustering = AgglomerativeClustering( 234 n_clusters=10, 235 connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), 236 linkage=linkage, 237 ) 238 with pytest.raises(ValueError): 239 clustering.fit(X) 240 241 # Test that using ward with another metric than euclidean raises an 242 # exception 243 clustering = AgglomerativeClustering( 244 n_clusters=10, 245 connectivity=connectivity.toarray(), 246 affinity="manhattan", 247 linkage="ward", 248 ) 249 with pytest.raises(ValueError): 250 clustering.fit(X) 251 252 # Test using another metric than euclidean works with linkage complete 253 for affinity in PAIRED_DISTANCES.keys(): 254 # Compare our (structured) implementation to scipy 255 clustering = AgglomerativeClustering( 256 n_clusters=10, 257 connectivity=np.ones((n_samples, n_samples)), 258 affinity=affinity, 259 linkage="complete", 260 ) 261 clustering.fit(X) 262 clustering2 = AgglomerativeClustering( 263 n_clusters=10, connectivity=None, affinity=affinity, linkage="complete" 264 ) 265 clustering2.fit(X) 266 assert_almost_equal( 267 normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1 268 ) 269 270 # Test that using a distance matrix (affinity = 'precomputed') has same 271 # results (with connectivity constraints) 272 clustering = AgglomerativeClustering( 273 n_clusters=10, connectivity=connectivity, linkage="complete" 274 ) 275 clustering.fit(X) 276 X_dist = pairwise_distances(X) 277 clustering2 = AgglomerativeClustering( 278 n_clusters=10, 279 connectivity=connectivity, 280 affinity="precomputed", 281 linkage="complete", 282 ) 283 clustering2.fit(X_dist) 284 assert_array_equal(clustering.labels_, clustering2.labels_) 285 286 287def test_agglomerative_clustering_memory_mapped(): 288 """AgglomerativeClustering must work on mem-mapped dataset. 289 290 Non-regression test for issue #19875. 291 """ 292 rng = np.random.RandomState(0) 293 Xmm = create_memmap_backed_data(rng.randn(50, 100)) 294 AgglomerativeClustering(affinity="euclidean", linkage="single").fit(Xmm) 295 296 297def test_ward_agglomeration(): 298 # Check that we obtain the correct solution in a simplistic case 299 rng = np.random.RandomState(0) 300 mask = np.ones([10, 10], dtype=bool) 301 X = rng.randn(50, 100) 302 connectivity = grid_to_graph(*mask.shape) 303 agglo = FeatureAgglomeration(n_clusters=5, connectivity=connectivity) 304 agglo.fit(X) 305 assert np.size(np.unique(agglo.labels_)) == 5 306 307 X_red = agglo.transform(X) 308 assert X_red.shape[1] == 5 309 X_full = agglo.inverse_transform(X_red) 310 assert np.unique(X_full[0]).size == 5 311 assert_array_almost_equal(agglo.transform(X_full), X_red) 312 313 # Check that fitting with no samples raises a ValueError 314 with pytest.raises(ValueError): 315 agglo.fit(X[:0]) 316 317 318def test_single_linkage_clustering(): 319 # Check that we get the correct result in two emblematic cases 320 moons, moon_labels = make_moons(noise=0.05, random_state=42) 321 clustering = AgglomerativeClustering(n_clusters=2, linkage="single") 322 clustering.fit(moons) 323 assert_almost_equal( 324 normalized_mutual_info_score(clustering.labels_, moon_labels), 1 325 ) 326 327 circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) 328 clustering = AgglomerativeClustering(n_clusters=2, linkage="single") 329 clustering.fit(circles) 330 assert_almost_equal( 331 normalized_mutual_info_score(clustering.labels_, circle_labels), 1 332 ) 333 334 335def assess_same_labelling(cut1, cut2): 336 """Util for comparison with scipy""" 337 co_clust = [] 338 for cut in [cut1, cut2]: 339 n = len(cut) 340 k = cut.max() + 1 341 ecut = np.zeros((n, k)) 342 ecut[np.arange(n), cut] = 1 343 co_clust.append(np.dot(ecut, ecut.T)) 344 assert (co_clust[0] == co_clust[1]).all() 345 346 347def test_sparse_scikit_vs_scipy(): 348 # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy 349 n, p, k = 10, 5, 3 350 rng = np.random.RandomState(0) 351 352 # Not using a lil_matrix here, just to check that non sparse 353 # matrices are well handled 354 connectivity = np.ones((n, n)) 355 for linkage in _TREE_BUILDERS.keys(): 356 for i in range(5): 357 X = 0.1 * rng.normal(size=(n, p)) 358 X -= 4.0 * np.arange(n)[:, np.newaxis] 359 X -= X.mean(axis=1)[:, np.newaxis] 360 361 out = hierarchy.linkage(X, method=linkage) 362 363 children_ = out[:, :2].astype(int, copy=False) 364 children, _, n_leaves, _ = _TREE_BUILDERS[linkage]( 365 X, connectivity=connectivity 366 ) 367 368 # Sort the order of child nodes per row for consistency 369 children.sort(axis=1) 370 assert_array_equal( 371 children, 372 children_, 373 "linkage tree differs from scipy impl for linkage: " + linkage, 374 ) 375 376 cut = _hc_cut(k, children, n_leaves) 377 cut_ = _hc_cut(k, children_, n_leaves) 378 assess_same_labelling(cut, cut_) 379 380 # Test error management in _hc_cut 381 with pytest.raises(ValueError): 382 _hc_cut(n_leaves + 1, children, n_leaves) 383 384 385# Make sure our custom mst_linkage_core gives 386# the same results as scipy's builtin 387@pytest.mark.parametrize("seed", range(5)) 388def test_vector_scikit_single_vs_scipy_single(seed): 389 n_samples, n_features, n_clusters = 10, 5, 3 390 rng = np.random.RandomState(seed) 391 X = 0.1 * rng.normal(size=(n_samples, n_features)) 392 X -= 4.0 * np.arange(n_samples)[:, np.newaxis] 393 X -= X.mean(axis=1)[:, np.newaxis] 394 395 out = hierarchy.linkage(X, method="single") 396 children_scipy = out[:, :2].astype(int) 397 398 children, _, n_leaves, _ = _TREE_BUILDERS["single"](X) 399 400 # Sort the order of child nodes per row for consistency 401 children.sort(axis=1) 402 assert_array_equal( 403 children, 404 children_scipy, 405 "linkage tree differs from scipy impl for single linkage.", 406 ) 407 408 cut = _hc_cut(n_clusters, children, n_leaves) 409 cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves) 410 assess_same_labelling(cut, cut_scipy) 411 412 413@pytest.mark.parametrize("metric_param_grid", METRICS_DEFAULT_PARAMS) 414def test_mst_linkage_core_memory_mapped(metric_param_grid): 415 """The MST-LINKAGE-CORE algorithm must work on mem-mapped dataset. 416 417 Non-regression test for issue #19875. 418 """ 419 rng = np.random.RandomState(seed=1) 420 X = rng.normal(size=(20, 4)) 421 Xmm = create_memmap_backed_data(X) 422 metric, param_grid = metric_param_grid 423 keys = param_grid.keys() 424 for vals in itertools.product(*param_grid.values()): 425 kwargs = dict(zip(keys, vals)) 426 distance_metric = DistanceMetric.get_metric(metric, **kwargs) 427 mst = mst_linkage_core(X, distance_metric) 428 mst_mm = mst_linkage_core(Xmm, distance_metric) 429 np.testing.assert_equal(mst, mst_mm) 430 431 432def test_identical_points(): 433 # Ensure identical points are handled correctly when using mst with 434 # a sparse connectivity matrix 435 X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) 436 true_labels = np.array([0, 0, 1, 1, 2, 2]) 437 connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) 438 connectivity = 0.5 * (connectivity + connectivity.T) 439 connectivity, n_components = _fix_connectivity(X, connectivity, "euclidean") 440 441 for linkage in ("single", "average", "average", "ward"): 442 clustering = AgglomerativeClustering( 443 n_clusters=3, linkage=linkage, connectivity=connectivity 444 ) 445 clustering.fit(X) 446 447 assert_almost_equal( 448 normalized_mutual_info_score(clustering.labels_, true_labels), 1 449 ) 450 451 452def test_connectivity_propagation(): 453 # Check that connectivity in the ward tree is propagated correctly during 454 # merging. 455 X = np.array( 456 [ 457 (0.014, 0.120), 458 (0.014, 0.099), 459 (0.014, 0.097), 460 (0.017, 0.153), 461 (0.017, 0.153), 462 (0.018, 0.153), 463 (0.018, 0.153), 464 (0.018, 0.153), 465 (0.018, 0.153), 466 (0.018, 0.153), 467 (0.018, 0.153), 468 (0.018, 0.153), 469 (0.018, 0.152), 470 (0.018, 0.149), 471 (0.018, 0.144), 472 ] 473 ) 474 connectivity = kneighbors_graph(X, 10, include_self=False) 475 ward = AgglomerativeClustering( 476 n_clusters=4, connectivity=connectivity, linkage="ward" 477 ) 478 # If changes are not propagated correctly, fit crashes with an 479 # IndexError 480 ward.fit(X) 481 482 483def test_ward_tree_children_order(): 484 # Check that children are ordered in the same way for both structured and 485 # unstructured versions of ward_tree. 486 487 # test on five random datasets 488 n, p = 10, 5 489 rng = np.random.RandomState(0) 490 491 connectivity = np.ones((n, n)) 492 for i in range(5): 493 X = 0.1 * rng.normal(size=(n, p)) 494 X -= 4.0 * np.arange(n)[:, np.newaxis] 495 X -= X.mean(axis=1)[:, np.newaxis] 496 497 out_unstructured = ward_tree(X) 498 out_structured = ward_tree(X, connectivity=connectivity) 499 500 assert_array_equal(out_unstructured[0], out_structured[0]) 501 502 503def test_ward_linkage_tree_return_distance(): 504 # Test return_distance option on linkage and ward trees 505 506 # test that return_distance when set true, gives same 507 # output on both structured and unstructured clustering. 508 n, p = 10, 5 509 rng = np.random.RandomState(0) 510 511 connectivity = np.ones((n, n)) 512 for i in range(5): 513 X = 0.1 * rng.normal(size=(n, p)) 514 X -= 4.0 * np.arange(n)[:, np.newaxis] 515 X -= X.mean(axis=1)[:, np.newaxis] 516 517 out_unstructured = ward_tree(X, return_distance=True) 518 out_structured = ward_tree(X, connectivity=connectivity, return_distance=True) 519 520 # get children 521 children_unstructured = out_unstructured[0] 522 children_structured = out_structured[0] 523 524 # check if we got the same clusters 525 assert_array_equal(children_unstructured, children_structured) 526 527 # check if the distances are the same 528 dist_unstructured = out_unstructured[-1] 529 dist_structured = out_structured[-1] 530 531 assert_array_almost_equal(dist_unstructured, dist_structured) 532 533 for linkage in ["average", "complete", "single"]: 534 structured_items = linkage_tree( 535 X, connectivity=connectivity, linkage=linkage, return_distance=True 536 )[-1] 537 unstructured_items = linkage_tree(X, linkage=linkage, return_distance=True)[ 538 -1 539 ] 540 structured_dist = structured_items[-1] 541 unstructured_dist = unstructured_items[-1] 542 structured_children = structured_items[0] 543 unstructured_children = unstructured_items[0] 544 assert_array_almost_equal(structured_dist, unstructured_dist) 545 assert_array_almost_equal(structured_children, unstructured_children) 546 547 # test on the following dataset where we know the truth 548 # taken from scipy/cluster/tests/hierarchy_test_data.py 549 X = np.array( 550 [ 551 [1.43054825, -7.5693489], 552 [6.95887839, 6.82293382], 553 [2.87137846, -9.68248579], 554 [7.87974764, -6.05485803], 555 [8.24018364, -6.09495602], 556 [7.39020262, 8.54004355], 557 ] 558 ) 559 # truth 560 linkage_X_ward = np.array( 561 [ 562 [3.0, 4.0, 0.36265956, 2.0], 563 [1.0, 5.0, 1.77045373, 2.0], 564 [0.0, 2.0, 2.55760419, 2.0], 565 [6.0, 8.0, 9.10208346, 4.0], 566 [7.0, 9.0, 24.7784379, 6.0], 567 ] 568 ) 569 570 linkage_X_complete = np.array( 571 [ 572 [3.0, 4.0, 0.36265956, 2.0], 573 [1.0, 5.0, 1.77045373, 2.0], 574 [0.0, 2.0, 2.55760419, 2.0], 575 [6.0, 8.0, 6.96742194, 4.0], 576 [7.0, 9.0, 18.77445997, 6.0], 577 ] 578 ) 579 580 linkage_X_average = np.array( 581 [ 582 [3.0, 4.0, 0.36265956, 2.0], 583 [1.0, 5.0, 1.77045373, 2.0], 584 [0.0, 2.0, 2.55760419, 2.0], 585 [6.0, 8.0, 6.55832839, 4.0], 586 [7.0, 9.0, 15.44089605, 6.0], 587 ] 588 ) 589 590 n_samples, n_features = np.shape(X) 591 connectivity_X = np.ones((n_samples, n_samples)) 592 593 out_X_unstructured = ward_tree(X, return_distance=True) 594 out_X_structured = ward_tree(X, connectivity=connectivity_X, return_distance=True) 595 596 # check that the labels are the same 597 assert_array_equal(linkage_X_ward[:, :2], out_X_unstructured[0]) 598 assert_array_equal(linkage_X_ward[:, :2], out_X_structured[0]) 599 600 # check that the distances are correct 601 assert_array_almost_equal(linkage_X_ward[:, 2], out_X_unstructured[4]) 602 assert_array_almost_equal(linkage_X_ward[:, 2], out_X_structured[4]) 603 604 linkage_options = ["complete", "average", "single"] 605 X_linkage_truth = [linkage_X_complete, linkage_X_average] 606 for (linkage, X_truth) in zip(linkage_options, X_linkage_truth): 607 out_X_unstructured = linkage_tree(X, return_distance=True, linkage=linkage) 608 out_X_structured = linkage_tree( 609 X, connectivity=connectivity_X, linkage=linkage, return_distance=True 610 ) 611 612 # check that the labels are the same 613 assert_array_equal(X_truth[:, :2], out_X_unstructured[0]) 614 assert_array_equal(X_truth[:, :2], out_X_structured[0]) 615 616 # check that the distances are correct 617 assert_array_almost_equal(X_truth[:, 2], out_X_unstructured[4]) 618 assert_array_almost_equal(X_truth[:, 2], out_X_structured[4]) 619 620 621def test_connectivity_fixing_non_lil(): 622 # Check non regression of a bug if a non item assignable connectivity is 623 # provided with more than one component. 624 # create dummy data 625 x = np.array([[0, 0], [1, 1]]) 626 # create a mask with several components to force connectivity fixing 627 m = np.array([[True, False], [False, True]]) 628 c = grid_to_graph(n_x=2, n_y=2, mask=m) 629 w = AgglomerativeClustering(connectivity=c, linkage="ward") 630 with pytest.warns(UserWarning): 631 w.fit(x) 632 633 634def test_int_float_dict(): 635 rng = np.random.RandomState(0) 636 keys = np.unique(rng.randint(100, size=10).astype(np.intp, copy=False)) 637 values = rng.rand(len(keys)) 638 639 d = IntFloatDict(keys, values) 640 for key, value in zip(keys, values): 641 assert d[key] == value 642 643 other_keys = np.arange(50, dtype=np.intp)[::2] 644 other_values = np.full(50, 0.5)[::2] 645 other = IntFloatDict(other_keys, other_values) 646 # Complete smoke test 647 max_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) 648 average_merge(d, other, mask=np.ones(100, dtype=np.intp), n_a=1, n_b=1) 649 650 651def test_connectivity_callable(): 652 rng = np.random.RandomState(0) 653 X = rng.rand(20, 5) 654 connectivity = kneighbors_graph(X, 3, include_self=False) 655 aglc1 = AgglomerativeClustering(connectivity=connectivity) 656 aglc2 = AgglomerativeClustering( 657 connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False) 658 ) 659 aglc1.fit(X) 660 aglc2.fit(X) 661 assert_array_equal(aglc1.labels_, aglc2.labels_) 662 663 664def test_connectivity_ignores_diagonal(): 665 rng = np.random.RandomState(0) 666 X = rng.rand(20, 5) 667 connectivity = kneighbors_graph(X, 3, include_self=False) 668 connectivity_include_self = kneighbors_graph(X, 3, include_self=True) 669 aglc1 = AgglomerativeClustering(connectivity=connectivity) 670 aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) 671 aglc1.fit(X) 672 aglc2.fit(X) 673 assert_array_equal(aglc1.labels_, aglc2.labels_) 674 675 676def test_compute_full_tree(): 677 # Test that the full tree is computed if n_clusters is small 678 rng = np.random.RandomState(0) 679 X = rng.randn(10, 2) 680 connectivity = kneighbors_graph(X, 5, include_self=False) 681 682 # When n_clusters is less, the full tree should be built 683 # that is the number of merges should be n_samples - 1 684 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) 685 agc.fit(X) 686 n_samples = X.shape[0] 687 n_nodes = agc.children_.shape[0] 688 assert n_nodes == n_samples - 1 689 690 # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. 691 # we should stop when there are n_clusters. 692 n_clusters = 101 693 X = rng.randn(200, 2) 694 connectivity = kneighbors_graph(X, 10, include_self=False) 695 agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) 696 agc.fit(X) 697 n_samples = X.shape[0] 698 n_nodes = agc.children_.shape[0] 699 assert n_nodes == n_samples - n_clusters 700 701 702def test_n_components(): 703 # Test n_components returned by linkage, average and ward tree 704 rng = np.random.RandomState(0) 705 X = rng.rand(5, 5) 706 707 # Connectivity matrix having five components. 708 connectivity = np.eye(5) 709 710 for linkage_func in _TREE_BUILDERS.values(): 711 assert ignore_warnings(linkage_func)(X, connectivity=connectivity)[1] == 5 712 713 714def test_agg_n_clusters(): 715 # Test that an error is raised when n_clusters <= 0 716 717 rng = np.random.RandomState(0) 718 X = rng.rand(20, 10) 719 for n_clus in [-1, 0]: 720 agc = AgglomerativeClustering(n_clusters=n_clus) 721 msg = "n_clusters should be an integer greater than 0. %s was provided." % str( 722 agc.n_clusters 723 ) 724 with pytest.raises(ValueError, match=msg): 725 agc.fit(X) 726 727 728def test_affinity_passed_to_fix_connectivity(): 729 # Test that the affinity parameter is actually passed to the pairwise 730 # function 731 732 size = 2 733 rng = np.random.RandomState(0) 734 X = rng.randn(size, size) 735 mask = np.array([True, False, False, True]) 736 737 connectivity = grid_to_graph(n_x=size, n_y=size, mask=mask, return_as=np.ndarray) 738 739 class FakeAffinity: 740 def __init__(self): 741 self.counter = 0 742 743 def increment(self, *args, **kwargs): 744 self.counter += 1 745 return self.counter 746 747 fa = FakeAffinity() 748 749 linkage_tree(X, connectivity=connectivity, affinity=fa.increment) 750 751 assert fa.counter == 3 752 753 754@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) 755def test_agglomerative_clustering_with_distance_threshold(linkage): 756 # Check that we obtain the correct number of clusters with 757 # agglomerative clustering with distance_threshold. 758 rng = np.random.RandomState(0) 759 mask = np.ones([10, 10], dtype=bool) 760 n_samples = 100 761 X = rng.randn(n_samples, 50) 762 connectivity = grid_to_graph(*mask.shape) 763 # test when distance threshold is set to 10 764 distance_threshold = 10 765 for conn in [None, connectivity]: 766 clustering = AgglomerativeClustering( 767 n_clusters=None, 768 distance_threshold=distance_threshold, 769 connectivity=conn, 770 linkage=linkage, 771 ) 772 clustering.fit(X) 773 clusters_produced = clustering.labels_ 774 num_clusters_produced = len(np.unique(clustering.labels_)) 775 # test if the clusters produced match the point in the linkage tree 776 # where the distance exceeds the threshold 777 tree_builder = _TREE_BUILDERS[linkage] 778 children, n_components, n_leaves, parent, distances = tree_builder( 779 X, connectivity=conn, n_clusters=None, return_distance=True 780 ) 781 num_clusters_at_threshold = ( 782 np.count_nonzero(distances >= distance_threshold) + 1 783 ) 784 # test number of clusters produced 785 assert num_clusters_at_threshold == num_clusters_produced 786 # test clusters produced 787 clusters_at_threshold = _hc_cut( 788 n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves 789 ) 790 assert np.array_equiv(clusters_produced, clusters_at_threshold) 791 792 793def test_small_distance_threshold(): 794 rng = np.random.RandomState(0) 795 n_samples = 10 796 X = rng.randint(-300, 300, size=(n_samples, 3)) 797 # this should result in all data in their own clusters, given that 798 # their pairwise distances are bigger than .1 (which may not be the case 799 # with a different random seed). 800 clustering = AgglomerativeClustering( 801 n_clusters=None, distance_threshold=1.0, linkage="single" 802 ).fit(X) 803 # check that the pairwise distances are indeed all larger than .1 804 all_distances = pairwise_distances(X, metric="minkowski", p=2) 805 np.fill_diagonal(all_distances, np.inf) 806 assert np.all(all_distances > 0.1) 807 assert clustering.n_clusters_ == n_samples 808 809 810def test_cluster_distances_with_distance_threshold(): 811 rng = np.random.RandomState(0) 812 n_samples = 100 813 X = rng.randint(-10, 10, size=(n_samples, 3)) 814 # check the distances within the clusters and with other clusters 815 distance_threshold = 4 816 clustering = AgglomerativeClustering( 817 n_clusters=None, distance_threshold=distance_threshold, linkage="single" 818 ).fit(X) 819 labels = clustering.labels_ 820 D = pairwise_distances(X, metric="minkowski", p=2) 821 # to avoid taking the 0 diagonal in min() 822 np.fill_diagonal(D, np.inf) 823 for label in np.unique(labels): 824 in_cluster_mask = labels == label 825 max_in_cluster_distance = ( 826 D[in_cluster_mask][:, in_cluster_mask].min(axis=0).max() 827 ) 828 min_out_cluster_distance = ( 829 D[in_cluster_mask][:, ~in_cluster_mask].min(axis=0).min() 830 ) 831 # single data point clusters only have that inf diagonal here 832 if in_cluster_mask.sum() > 1: 833 assert max_in_cluster_distance < distance_threshold 834 assert min_out_cluster_distance >= distance_threshold 835 836 837@pytest.mark.parametrize("linkage", ["ward", "complete", "average"]) 838@pytest.mark.parametrize( 839 ("threshold", "y_true"), [(0.5, [1, 0]), (1.0, [1, 0]), (1.5, [0, 0])] 840) 841def test_agglomerative_clustering_with_distance_threshold_edge_case( 842 linkage, threshold, y_true 843): 844 # test boundary case of distance_threshold matching the distance 845 X = [[0], [1]] 846 clusterer = AgglomerativeClustering( 847 n_clusters=None, distance_threshold=threshold, linkage=linkage 848 ) 849 y_pred = clusterer.fit_predict(X) 850 assert adjusted_rand_score(y_true, y_pred) == 1 851 852 853def test_dist_threshold_invalid_parameters(): 854 X = [[0], [1]] 855 with pytest.raises(ValueError, match="Exactly one of "): 856 AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X) 857 858 with pytest.raises(ValueError, match="Exactly one of "): 859 AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X) 860 861 X = [[0], [1]] 862 with pytest.raises(ValueError, match="compute_full_tree must be True if"): 863 AgglomerativeClustering( 864 n_clusters=None, distance_threshold=1, compute_full_tree=False 865 ).fit(X) 866 867 868def test_invalid_shape_precomputed_dist_matrix(): 869 # Check that an error is raised when affinity='precomputed' 870 # and a non square matrix is passed (PR #16257). 871 rng = np.random.RandomState(0) 872 X = rng.rand(5, 3) 873 with pytest.raises( 874 ValueError, 875 match=r"Distance matrix should be square, got matrix of shape \(5, 3\)", 876 ): 877 AgglomerativeClustering(affinity="precomputed", linkage="complete").fit(X) 878 879 880def test_precomputed_connectivity_affinity_with_2_connected_components(): 881 """Check that connecting components works when connectivity and 882 affinity are both precomputed and the number of connected components is 883 greater than 1. Non-regression test for #16151. 884 """ 885 886 connectivity_matrix = np.array( 887 [ 888 [0, 1, 1, 0, 0], 889 [0, 0, 1, 0, 0], 890 [0, 0, 0, 0, 0], 891 [0, 0, 0, 0, 1], 892 [0, 0, 0, 0, 0], 893 ] 894 ) 895 # ensure that connectivity_matrix has two connected components 896 assert connected_components(connectivity_matrix)[0] == 2 897 898 rng = np.random.RandomState(0) 899 X = rng.randn(5, 10) 900 901 X_dist = pairwise_distances(X) 902 clusterer_precomputed = AgglomerativeClustering( 903 affinity="precomputed", connectivity=connectivity_matrix, linkage="complete" 904 ) 905 msg = "Completing it to avoid stopping the tree early" 906 with pytest.warns(UserWarning, match=msg): 907 clusterer_precomputed.fit(X_dist) 908 909 clusterer = AgglomerativeClustering( 910 connectivity=connectivity_matrix, linkage="complete" 911 ) 912 with pytest.warns(UserWarning, match=msg): 913 clusterer.fit(X) 914 915 assert_array_equal(clusterer.labels_, clusterer_precomputed.labels_) 916 assert_array_equal(clusterer.children_, clusterer_precomputed.children_) 917