1""" 2Hierarchical clustering (:mod:`scipy.cluster.hierarchy`) 3======================================================== 4 5.. currentmodule:: scipy.cluster.hierarchy 6 7These functions cut hierarchical clusterings into flat clusterings 8or find the roots of the forest formed by a cut by providing the flat 9cluster ids of each observation. 10 11.. autosummary:: 12 :toctree: generated/ 13 14 fcluster 15 fclusterdata 16 leaders 17 18These are routines for agglomerative clustering. 19 20.. autosummary:: 21 :toctree: generated/ 22 23 linkage 24 single 25 complete 26 average 27 weighted 28 centroid 29 median 30 ward 31 32These routines compute statistics on hierarchies. 33 34.. autosummary:: 35 :toctree: generated/ 36 37 cophenet 38 from_mlab_linkage 39 inconsistent 40 maxinconsts 41 maxdists 42 maxRstat 43 to_mlab_linkage 44 45Routines for visualizing flat clusters. 46 47.. autosummary:: 48 :toctree: generated/ 49 50 dendrogram 51 52These are data structures and routines for representing hierarchies as 53tree objects. 54 55.. autosummary:: 56 :toctree: generated/ 57 58 ClusterNode 59 leaves_list 60 to_tree 61 cut_tree 62 optimal_leaf_ordering 63 64These are predicates for checking the validity of linkage and 65inconsistency matrices as well as for checking isomorphism of two 66flat cluster assignments. 67 68.. autosummary:: 69 :toctree: generated/ 70 71 is_valid_im 72 is_valid_linkage 73 is_isomorphic 74 is_monotonic 75 correspond 76 num_obs_linkage 77 78Utility routines for plotting: 79 80.. autosummary:: 81 :toctree: generated/ 82 83 set_link_color_palette 84 85Utility classes: 86 87.. autosummary:: 88 :toctree: generated/ 89 90 DisjointSet -- data structure for incremental connectivity queries 91 92""" 93# Copyright (C) Damian Eads, 2007-2008. New BSD License. 94 95# hierarchy.py (derived from cluster.py, http://scipy-cluster.googlecode.com) 96# 97# Author: Damian Eads 98# Date: September 22, 2007 99# 100# Copyright (c) 2007, 2008, Damian Eads 101# 102# All rights reserved. 103# 104# Redistribution and use in source and binary forms, with or without 105# modification, are permitted provided that the following conditions 106# are met: 107# - Redistributions of source code must retain the above 108# copyright notice, this list of conditions and the 109# following disclaimer. 110# - Redistributions in binary form must reproduce the above copyright 111# notice, this list of conditions and the following disclaimer 112# in the documentation and/or other materials provided with the 113# distribution. 114# - Neither the name of the author nor the names of its 115# contributors may be used to endorse or promote products derived 116# from this software without specific prior written permission. 117# 118# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 119# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 120# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 121# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 122# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 123# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 124# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 125# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 126# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 127# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 128# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 129 130import warnings 131import bisect 132from collections import deque 133 134import numpy as np 135from . import _hierarchy, _optimal_leaf_ordering 136import scipy.spatial.distance as distance 137from scipy._lib._disjoint_set import DisjointSet 138 139 140_LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3, 141 'median': 4, 'ward': 5, 'weighted': 6} 142_EUCLIDEAN_METHODS = ('centroid', 'median', 'ward') 143 144__all__ = ['ClusterNode', 'DisjointSet', 'average', 'centroid', 'complete', 145 'cophenet', 'correspond', 'cut_tree', 'dendrogram', 'fcluster', 146 'fclusterdata', 'from_mlab_linkage', 'inconsistent', 147 'is_isomorphic', 'is_monotonic', 'is_valid_im', 'is_valid_linkage', 148 'leaders', 'leaves_list', 'linkage', 'maxRstat', 'maxdists', 149 'maxinconsts', 'median', 'num_obs_linkage', 'optimal_leaf_ordering', 150 'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree', 151 'ward', 'weighted', 'distance'] 152 153 154class ClusterWarning(UserWarning): 155 pass 156 157 158def _warning(s): 159 warnings.warn('scipy.cluster: %s' % s, ClusterWarning, stacklevel=3) 160 161 162def _copy_array_if_base_present(a): 163 """ 164 Copy the array if its base points to a parent array. 165 """ 166 if a.base is not None: 167 return a.copy() 168 elif np.issubsctype(a, np.float32): 169 return np.array(a, dtype=np.double) 170 else: 171 return a 172 173 174def _copy_arrays_if_base_present(T): 175 """ 176 Accept a tuple of arrays T. Copies the array T[i] if its base array 177 points to an actual array. Otherwise, the reference is just copied. 178 This is useful if the arrays are being passed to a C function that 179 does not do proper striding. 180 """ 181 l = [_copy_array_if_base_present(a) for a in T] 182 return l 183 184 185def _randdm(pnts): 186 """ 187 Generate a random distance matrix stored in condensed form. 188 189 Parameters 190 ---------- 191 pnts : int 192 The number of points in the distance matrix. Has to be at least 2. 193 194 Returns 195 ------- 196 D : ndarray 197 A ``pnts * (pnts - 1) / 2`` sized vector is returned. 198 """ 199 if pnts >= 2: 200 D = np.random.rand(pnts * (pnts - 1) / 2) 201 else: 202 raise ValueError("The number of points in the distance matrix " 203 "must be at least 2.") 204 return D 205 206 207def single(y): 208 """ 209 Perform single/min/nearest linkage on the condensed distance matrix ``y``. 210 211 Parameters 212 ---------- 213 y : ndarray 214 The upper triangular of the distance matrix. The result of 215 ``pdist`` is returned in this form. 216 217 Returns 218 ------- 219 Z : ndarray 220 The linkage matrix. 221 222 See Also 223 -------- 224 linkage : for advanced creation of hierarchical clusterings. 225 scipy.spatial.distance.pdist : pairwise distance metrics 226 227 Examples 228 -------- 229 >>> from scipy.cluster.hierarchy import single, fcluster 230 >>> from scipy.spatial.distance import pdist 231 232 First, we need a toy dataset to play with:: 233 234 x x x x 235 x x 236 237 x x 238 x x x x 239 240 >>> X = [[0, 0], [0, 1], [1, 0], 241 ... [0, 4], [0, 3], [1, 4], 242 ... [4, 0], [3, 0], [4, 1], 243 ... [4, 4], [3, 4], [4, 3]] 244 245 Then, we get a condensed distance matrix from this dataset: 246 247 >>> y = pdist(X) 248 249 Finally, we can perform the clustering: 250 251 >>> Z = single(y) 252 >>> Z 253 array([[ 0., 1., 1., 2.], 254 [ 2., 12., 1., 3.], 255 [ 3., 4., 1., 2.], 256 [ 5., 14., 1., 3.], 257 [ 6., 7., 1., 2.], 258 [ 8., 16., 1., 3.], 259 [ 9., 10., 1., 2.], 260 [11., 18., 1., 3.], 261 [13., 15., 2., 6.], 262 [17., 20., 2., 9.], 263 [19., 21., 2., 12.]]) 264 265 The linkage matrix ``Z`` represents a dendrogram - see 266 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 267 contents. 268 269 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 270 each initial point would belong given a distance threshold: 271 272 >>> fcluster(Z, 0.9, criterion='distance') 273 array([ 7, 8, 9, 10, 11, 12, 4, 5, 6, 1, 2, 3], dtype=int32) 274 >>> fcluster(Z, 1, criterion='distance') 275 array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) 276 >>> fcluster(Z, 2, criterion='distance') 277 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 278 279 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 280 plot of the dendrogram. 281 """ 282 return linkage(y, method='single', metric='euclidean') 283 284 285def complete(y): 286 """ 287 Perform complete/max/farthest point linkage on a condensed distance matrix. 288 289 Parameters 290 ---------- 291 y : ndarray 292 The upper triangular of the distance matrix. The result of 293 ``pdist`` is returned in this form. 294 295 Returns 296 ------- 297 Z : ndarray 298 A linkage matrix containing the hierarchical clustering. See 299 the `linkage` function documentation for more information 300 on its structure. 301 302 See Also 303 -------- 304 linkage : for advanced creation of hierarchical clusterings. 305 scipy.spatial.distance.pdist : pairwise distance metrics 306 307 Examples 308 -------- 309 >>> from scipy.cluster.hierarchy import complete, fcluster 310 >>> from scipy.spatial.distance import pdist 311 312 First, we need a toy dataset to play with:: 313 314 x x x x 315 x x 316 317 x x 318 x x x x 319 320 >>> X = [[0, 0], [0, 1], [1, 0], 321 ... [0, 4], [0, 3], [1, 4], 322 ... [4, 0], [3, 0], [4, 1], 323 ... [4, 4], [3, 4], [4, 3]] 324 325 Then, we get a condensed distance matrix from this dataset: 326 327 >>> y = pdist(X) 328 329 Finally, we can perform the clustering: 330 331 >>> Z = complete(y) 332 >>> Z 333 array([[ 0. , 1. , 1. , 2. ], 334 [ 3. , 4. , 1. , 2. ], 335 [ 6. , 7. , 1. , 2. ], 336 [ 9. , 10. , 1. , 2. ], 337 [ 2. , 12. , 1.41421356, 3. ], 338 [ 5. , 13. , 1.41421356, 3. ], 339 [ 8. , 14. , 1.41421356, 3. ], 340 [11. , 15. , 1.41421356, 3. ], 341 [16. , 17. , 4.12310563, 6. ], 342 [18. , 19. , 4.12310563, 6. ], 343 [20. , 21. , 5.65685425, 12. ]]) 344 345 The linkage matrix ``Z`` represents a dendrogram - see 346 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 347 contents. 348 349 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 350 each initial point would belong given a distance threshold: 351 352 >>> fcluster(Z, 0.9, criterion='distance') 353 array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) 354 >>> fcluster(Z, 1.5, criterion='distance') 355 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 356 >>> fcluster(Z, 4.5, criterion='distance') 357 array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32) 358 >>> fcluster(Z, 6, criterion='distance') 359 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 360 361 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 362 plot of the dendrogram. 363 """ 364 return linkage(y, method='complete', metric='euclidean') 365 366 367def average(y): 368 """ 369 Perform average/UPGMA linkage on a condensed distance matrix. 370 371 Parameters 372 ---------- 373 y : ndarray 374 The upper triangular of the distance matrix. The result of 375 ``pdist`` is returned in this form. 376 377 Returns 378 ------- 379 Z : ndarray 380 A linkage matrix containing the hierarchical clustering. See 381 `linkage` for more information on its structure. 382 383 See Also 384 -------- 385 linkage : for advanced creation of hierarchical clusterings. 386 scipy.spatial.distance.pdist : pairwise distance metrics 387 388 Examples 389 -------- 390 >>> from scipy.cluster.hierarchy import average, fcluster 391 >>> from scipy.spatial.distance import pdist 392 393 First, we need a toy dataset to play with:: 394 395 x x x x 396 x x 397 398 x x 399 x x x x 400 401 >>> X = [[0, 0], [0, 1], [1, 0], 402 ... [0, 4], [0, 3], [1, 4], 403 ... [4, 0], [3, 0], [4, 1], 404 ... [4, 4], [3, 4], [4, 3]] 405 406 Then, we get a condensed distance matrix from this dataset: 407 408 >>> y = pdist(X) 409 410 Finally, we can perform the clustering: 411 412 >>> Z = average(y) 413 >>> Z 414 array([[ 0. , 1. , 1. , 2. ], 415 [ 3. , 4. , 1. , 2. ], 416 [ 6. , 7. , 1. , 2. ], 417 [ 9. , 10. , 1. , 2. ], 418 [ 2. , 12. , 1.20710678, 3. ], 419 [ 5. , 13. , 1.20710678, 3. ], 420 [ 8. , 14. , 1.20710678, 3. ], 421 [11. , 15. , 1.20710678, 3. ], 422 [16. , 17. , 3.39675184, 6. ], 423 [18. , 19. , 3.39675184, 6. ], 424 [20. , 21. , 4.09206523, 12. ]]) 425 426 The linkage matrix ``Z`` represents a dendrogram - see 427 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 428 contents. 429 430 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 431 each initial point would belong given a distance threshold: 432 433 >>> fcluster(Z, 0.9, criterion='distance') 434 array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) 435 >>> fcluster(Z, 1.5, criterion='distance') 436 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 437 >>> fcluster(Z, 4, criterion='distance') 438 array([1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2], dtype=int32) 439 >>> fcluster(Z, 6, criterion='distance') 440 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 441 442 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 443 plot of the dendrogram. 444 445 """ 446 return linkage(y, method='average', metric='euclidean') 447 448 449def weighted(y): 450 """ 451 Perform weighted/WPGMA linkage on the condensed distance matrix. 452 453 See `linkage` for more information on the return 454 structure and algorithm. 455 456 Parameters 457 ---------- 458 y : ndarray 459 The upper triangular of the distance matrix. The result of 460 ``pdist`` is returned in this form. 461 462 Returns 463 ------- 464 Z : ndarray 465 A linkage matrix containing the hierarchical clustering. See 466 `linkage` for more information on its structure. 467 468 See Also 469 -------- 470 linkage : for advanced creation of hierarchical clusterings. 471 scipy.spatial.distance.pdist : pairwise distance metrics 472 473 Examples 474 -------- 475 >>> from scipy.cluster.hierarchy import weighted, fcluster 476 >>> from scipy.spatial.distance import pdist 477 478 First, we need a toy dataset to play with:: 479 480 x x x x 481 x x 482 483 x x 484 x x x x 485 486 >>> X = [[0, 0], [0, 1], [1, 0], 487 ... [0, 4], [0, 3], [1, 4], 488 ... [4, 0], [3, 0], [4, 1], 489 ... [4, 4], [3, 4], [4, 3]] 490 491 Then, we get a condensed distance matrix from this dataset: 492 493 >>> y = pdist(X) 494 495 Finally, we can perform the clustering: 496 497 >>> Z = weighted(y) 498 >>> Z 499 array([[ 0. , 1. , 1. , 2. ], 500 [ 6. , 7. , 1. , 2. ], 501 [ 3. , 4. , 1. , 2. ], 502 [ 9. , 11. , 1. , 2. ], 503 [ 2. , 12. , 1.20710678, 3. ], 504 [ 8. , 13. , 1.20710678, 3. ], 505 [ 5. , 14. , 1.20710678, 3. ], 506 [10. , 15. , 1.20710678, 3. ], 507 [18. , 19. , 3.05595762, 6. ], 508 [16. , 17. , 3.32379407, 6. ], 509 [20. , 21. , 4.06357713, 12. ]]) 510 511 The linkage matrix ``Z`` represents a dendrogram - see 512 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 513 contents. 514 515 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 516 each initial point would belong given a distance threshold: 517 518 >>> fcluster(Z, 0.9, criterion='distance') 519 array([ 7, 8, 9, 1, 2, 3, 10, 11, 12, 4, 6, 5], dtype=int32) 520 >>> fcluster(Z, 1.5, criterion='distance') 521 array([3, 3, 3, 1, 1, 1, 4, 4, 4, 2, 2, 2], dtype=int32) 522 >>> fcluster(Z, 4, criterion='distance') 523 array([2, 2, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1], dtype=int32) 524 >>> fcluster(Z, 6, criterion='distance') 525 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 526 527 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 528 plot of the dendrogram. 529 530 """ 531 return linkage(y, method='weighted', metric='euclidean') 532 533 534def centroid(y): 535 """ 536 Perform centroid/UPGMC linkage. 537 538 See `linkage` for more information on the input matrix, 539 return structure, and algorithm. 540 541 The following are common calling conventions: 542 543 1. ``Z = centroid(y)`` 544 545 Performs centroid/UPGMC linkage on the condensed distance 546 matrix ``y``. 547 548 2. ``Z = centroid(X)`` 549 550 Performs centroid/UPGMC linkage on the observation matrix ``X`` 551 using Euclidean distance as the distance metric. 552 553 Parameters 554 ---------- 555 y : ndarray 556 A condensed distance matrix. A condensed 557 distance matrix is a flat array containing the upper 558 triangular of the distance matrix. This is the form that 559 ``pdist`` returns. Alternatively, a collection of 560 m observation vectors in n dimensions may be passed as 561 an m by n array. 562 563 Returns 564 ------- 565 Z : ndarray 566 A linkage matrix containing the hierarchical clustering. See 567 the `linkage` function documentation for more information 568 on its structure. 569 570 See Also 571 -------- 572 linkage : for advanced creation of hierarchical clusterings. 573 scipy.spatial.distance.pdist : pairwise distance metrics 574 575 Examples 576 -------- 577 >>> from scipy.cluster.hierarchy import centroid, fcluster 578 >>> from scipy.spatial.distance import pdist 579 580 First, we need a toy dataset to play with:: 581 582 x x x x 583 x x 584 585 x x 586 x x x x 587 588 >>> X = [[0, 0], [0, 1], [1, 0], 589 ... [0, 4], [0, 3], [1, 4], 590 ... [4, 0], [3, 0], [4, 1], 591 ... [4, 4], [3, 4], [4, 3]] 592 593 Then, we get a condensed distance matrix from this dataset: 594 595 >>> y = pdist(X) 596 597 Finally, we can perform the clustering: 598 599 >>> Z = centroid(y) 600 >>> Z 601 array([[ 0. , 1. , 1. , 2. ], 602 [ 3. , 4. , 1. , 2. ], 603 [ 9. , 10. , 1. , 2. ], 604 [ 6. , 7. , 1. , 2. ], 605 [ 2. , 12. , 1.11803399, 3. ], 606 [ 5. , 13. , 1.11803399, 3. ], 607 [ 8. , 15. , 1.11803399, 3. ], 608 [11. , 14. , 1.11803399, 3. ], 609 [18. , 19. , 3.33333333, 6. ], 610 [16. , 17. , 3.33333333, 6. ], 611 [20. , 21. , 3.33333333, 12. ]]) 612 613 The linkage matrix ``Z`` represents a dendrogram - see 614 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 615 contents. 616 617 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 618 each initial point would belong given a distance threshold: 619 620 >>> fcluster(Z, 0.9, criterion='distance') 621 array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) 622 >>> fcluster(Z, 1.1, criterion='distance') 623 array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) 624 >>> fcluster(Z, 2, criterion='distance') 625 array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) 626 >>> fcluster(Z, 4, criterion='distance') 627 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 628 629 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 630 plot of the dendrogram. 631 632 """ 633 return linkage(y, method='centroid', metric='euclidean') 634 635 636def median(y): 637 """ 638 Perform median/WPGMC linkage. 639 640 See `linkage` for more information on the return structure 641 and algorithm. 642 643 The following are common calling conventions: 644 645 1. ``Z = median(y)`` 646 647 Performs median/WPGMC linkage on the condensed distance matrix 648 ``y``. See ``linkage`` for more information on the return 649 structure and algorithm. 650 651 2. ``Z = median(X)`` 652 653 Performs median/WPGMC linkage on the observation matrix ``X`` 654 using Euclidean distance as the distance metric. See `linkage` 655 for more information on the return structure and algorithm. 656 657 Parameters 658 ---------- 659 y : ndarray 660 A condensed distance matrix. A condensed 661 distance matrix is a flat array containing the upper 662 triangular of the distance matrix. This is the form that 663 ``pdist`` returns. Alternatively, a collection of 664 m observation vectors in n dimensions may be passed as 665 an m by n array. 666 667 Returns 668 ------- 669 Z : ndarray 670 The hierarchical clustering encoded as a linkage matrix. 671 672 See Also 673 -------- 674 linkage : for advanced creation of hierarchical clusterings. 675 scipy.spatial.distance.pdist : pairwise distance metrics 676 677 Examples 678 -------- 679 >>> from scipy.cluster.hierarchy import median, fcluster 680 >>> from scipy.spatial.distance import pdist 681 682 First, we need a toy dataset to play with:: 683 684 x x x x 685 x x 686 687 x x 688 x x x x 689 690 >>> X = [[0, 0], [0, 1], [1, 0], 691 ... [0, 4], [0, 3], [1, 4], 692 ... [4, 0], [3, 0], [4, 1], 693 ... [4, 4], [3, 4], [4, 3]] 694 695 Then, we get a condensed distance matrix from this dataset: 696 697 >>> y = pdist(X) 698 699 Finally, we can perform the clustering: 700 701 >>> Z = median(y) 702 >>> Z 703 array([[ 0. , 1. , 1. , 2. ], 704 [ 3. , 4. , 1. , 2. ], 705 [ 9. , 10. , 1. , 2. ], 706 [ 6. , 7. , 1. , 2. ], 707 [ 2. , 12. , 1.11803399, 3. ], 708 [ 5. , 13. , 1.11803399, 3. ], 709 [ 8. , 15. , 1.11803399, 3. ], 710 [11. , 14. , 1.11803399, 3. ], 711 [18. , 19. , 3. , 6. ], 712 [16. , 17. , 3.5 , 6. ], 713 [20. , 21. , 3.25 , 12. ]]) 714 715 The linkage matrix ``Z`` represents a dendrogram - see 716 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 717 contents. 718 719 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 720 each initial point would belong given a distance threshold: 721 722 >>> fcluster(Z, 0.9, criterion='distance') 723 array([ 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5, 6], dtype=int32) 724 >>> fcluster(Z, 1.1, criterion='distance') 725 array([5, 5, 6, 7, 7, 8, 1, 1, 2, 3, 3, 4], dtype=int32) 726 >>> fcluster(Z, 2, criterion='distance') 727 array([3, 3, 3, 4, 4, 4, 1, 1, 1, 2, 2, 2], dtype=int32) 728 >>> fcluster(Z, 4, criterion='distance') 729 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 730 731 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 732 plot of the dendrogram. 733 734 """ 735 return linkage(y, method='median', metric='euclidean') 736 737 738def ward(y): 739 """ 740 Perform Ward's linkage on a condensed distance matrix. 741 742 See `linkage` for more information on the return structure 743 and algorithm. 744 745 The following are common calling conventions: 746 747 1. ``Z = ward(y)`` 748 Performs Ward's linkage on the condensed distance matrix ``y``. 749 750 2. ``Z = ward(X)`` 751 Performs Ward's linkage on the observation matrix ``X`` using 752 Euclidean distance as the distance metric. 753 754 Parameters 755 ---------- 756 y : ndarray 757 A condensed distance matrix. A condensed 758 distance matrix is a flat array containing the upper 759 triangular of the distance matrix. This is the form that 760 ``pdist`` returns. Alternatively, a collection of 761 m observation vectors in n dimensions may be passed as 762 an m by n array. 763 764 Returns 765 ------- 766 Z : ndarray 767 The hierarchical clustering encoded as a linkage matrix. See 768 `linkage` for more information on the return structure and 769 algorithm. 770 771 See Also 772 -------- 773 linkage : for advanced creation of hierarchical clusterings. 774 scipy.spatial.distance.pdist : pairwise distance metrics 775 776 Examples 777 -------- 778 >>> from scipy.cluster.hierarchy import ward, fcluster 779 >>> from scipy.spatial.distance import pdist 780 781 First, we need a toy dataset to play with:: 782 783 x x x x 784 x x 785 786 x x 787 x x x x 788 789 >>> X = [[0, 0], [0, 1], [1, 0], 790 ... [0, 4], [0, 3], [1, 4], 791 ... [4, 0], [3, 0], [4, 1], 792 ... [4, 4], [3, 4], [4, 3]] 793 794 Then, we get a condensed distance matrix from this dataset: 795 796 >>> y = pdist(X) 797 798 Finally, we can perform the clustering: 799 800 >>> Z = ward(y) 801 >>> Z 802 array([[ 0. , 1. , 1. , 2. ], 803 [ 3. , 4. , 1. , 2. ], 804 [ 6. , 7. , 1. , 2. ], 805 [ 9. , 10. , 1. , 2. ], 806 [ 2. , 12. , 1.29099445, 3. ], 807 [ 5. , 13. , 1.29099445, 3. ], 808 [ 8. , 14. , 1.29099445, 3. ], 809 [11. , 15. , 1.29099445, 3. ], 810 [16. , 17. , 5.77350269, 6. ], 811 [18. , 19. , 5.77350269, 6. ], 812 [20. , 21. , 8.16496581, 12. ]]) 813 814 The linkage matrix ``Z`` represents a dendrogram - see 815 `scipy.cluster.hierarchy.linkage` for a detailed explanation of its 816 contents. 817 818 We can use `scipy.cluster.hierarchy.fcluster` to see to which cluster 819 each initial point would belong given a distance threshold: 820 821 >>> fcluster(Z, 0.9, criterion='distance') 822 array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) 823 >>> fcluster(Z, 1.1, criterion='distance') 824 array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32) 825 >>> fcluster(Z, 3, criterion='distance') 826 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 827 >>> fcluster(Z, 9, criterion='distance') 828 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 829 830 Also, `scipy.cluster.hierarchy.dendrogram` can be used to generate a 831 plot of the dendrogram. 832 833 """ 834 return linkage(y, method='ward', metric='euclidean') 835 836 837def linkage(y, method='single', metric='euclidean', optimal_ordering=False): 838 """ 839 Perform hierarchical/agglomerative clustering. 840 841 The input y may be either a 1-D condensed distance matrix 842 or a 2-D array of observation vectors. 843 844 If y is a 1-D condensed distance matrix, 845 then y must be a :math:`\\binom{n}{2}` sized 846 vector, where n is the number of original observations paired 847 in the distance matrix. The behavior of this function is very 848 similar to the MATLAB linkage function. 849 850 A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the 851 :math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and 852 ``Z[i, 1]`` are combined to form cluster :math:`n + i`. A 853 cluster with an index less than :math:`n` corresponds to one of 854 the :math:`n` original observations. The distance between 855 clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The 856 fourth value ``Z[i, 3]`` represents the number of original 857 observations in the newly formed cluster. 858 859 The following linkage methods are used to compute the distance 860 :math:`d(s, t)` between two clusters :math:`s` and 861 :math:`t`. The algorithm begins with a forest of clusters that 862 have yet to be used in the hierarchy being formed. When two 863 clusters :math:`s` and :math:`t` from this forest are combined 864 into a single cluster :math:`u`, :math:`s` and :math:`t` are 865 removed from the forest, and :math:`u` is added to the 866 forest. When only one cluster remains in the forest, the algorithm 867 stops, and this cluster becomes the root. 868 869 A distance matrix is maintained at each iteration. The ``d[i,j]`` 870 entry corresponds to the distance between cluster :math:`i` and 871 :math:`j` in the original forest. 872 873 At each iteration, the algorithm must update the distance matrix 874 to reflect the distance of the newly formed cluster u with the 875 remaining clusters in the forest. 876 877 Suppose there are :math:`|u|` original observations 878 :math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and 879 :math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in 880 cluster :math:`v`. Recall, :math:`s` and :math:`t` are 881 combined to form cluster :math:`u`. Let :math:`v` be any 882 remaining cluster in the forest that is not :math:`u`. 883 884 The following are methods for calculating the distance between the 885 newly formed cluster :math:`u` and each :math:`v`. 886 887 * method='single' assigns 888 889 .. math:: 890 d(u,v) = \\min(dist(u[i],v[j])) 891 892 for all points :math:`i` in cluster :math:`u` and 893 :math:`j` in cluster :math:`v`. This is also known as the 894 Nearest Point Algorithm. 895 896 * method='complete' assigns 897 898 .. math:: 899 d(u, v) = \\max(dist(u[i],v[j])) 900 901 for all points :math:`i` in cluster u and :math:`j` in 902 cluster :math:`v`. This is also known by the Farthest Point 903 Algorithm or Voor Hees Algorithm. 904 905 * method='average' assigns 906 907 .. math:: 908 d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])} 909 {(|u|*|v|)} 910 911 for all points :math:`i` and :math:`j` where :math:`|u|` 912 and :math:`|v|` are the cardinalities of clusters :math:`u` 913 and :math:`v`, respectively. This is also called the UPGMA 914 algorithm. 915 916 * method='weighted' assigns 917 918 .. math:: 919 d(u,v) = (dist(s,v) + dist(t,v))/2 920 921 where cluster u was formed with cluster s and t and v 922 is a remaining cluster in the forest (also called WPGMA). 923 924 * method='centroid' assigns 925 926 .. math:: 927 dist(s,t) = ||c_s-c_t||_2 928 929 where :math:`c_s` and :math:`c_t` are the centroids of 930 clusters :math:`s` and :math:`t`, respectively. When two 931 clusters :math:`s` and :math:`t` are combined into a new 932 cluster :math:`u`, the new centroid is computed over all the 933 original objects in clusters :math:`s` and :math:`t`. The 934 distance then becomes the Euclidean distance between the 935 centroid of :math:`u` and the centroid of a remaining cluster 936 :math:`v` in the forest. This is also known as the UPGMC 937 algorithm. 938 939 * method='median' assigns :math:`d(s,t)` like the ``centroid`` 940 method. When two clusters :math:`s` and :math:`t` are combined 941 into a new cluster :math:`u`, the average of centroids s and t 942 give the new centroid :math:`u`. This is also known as the 943 WPGMC algorithm. 944 945 * method='ward' uses the Ward variance minimization algorithm. 946 The new entry :math:`d(u,v)` is computed as follows, 947 948 .. math:: 949 950 d(u,v) = \\sqrt{\\frac{|v|+|s|} 951 {T}d(v,s)^2 952 + \\frac{|v|+|t|} 953 {T}d(v,t)^2 954 - \\frac{|v|} 955 {T}d(s,t)^2} 956 957 where :math:`u` is the newly joined cluster consisting of 958 clusters :math:`s` and :math:`t`, :math:`v` is an unused 959 cluster in the forest, :math:`T=|v|+|s|+|t|`, and 960 :math:`|*|` is the cardinality of its argument. This is also 961 known as the incremental algorithm. 962 963 Warning: When the minimum distance pair in the forest is chosen, there 964 may be two or more pairs with the same minimum distance. This 965 implementation may choose a different minimum than the MATLAB 966 version. 967 968 Parameters 969 ---------- 970 y : ndarray 971 A condensed distance matrix. A condensed distance matrix 972 is a flat array containing the upper triangular of the distance matrix. 973 This is the form that ``pdist`` returns. Alternatively, a collection of 974 :math:`m` observation vectors in :math:`n` dimensions may be passed as 975 an :math:`m` by :math:`n` array. All elements of the condensed distance 976 matrix must be finite, i.e., no NaNs or infs. 977 method : str, optional 978 The linkage algorithm to use. See the ``Linkage Methods`` section below 979 for full descriptions. 980 metric : str or function, optional 981 The distance metric to use in the case that y is a collection of 982 observation vectors; ignored otherwise. See the ``pdist`` 983 function for a list of valid distance metrics. A custom distance 984 function can also be used. 985 optimal_ordering : bool, optional 986 If True, the linkage matrix will be reordered so that the distance 987 between successive leaves is minimal. This results in a more intuitive 988 tree structure when the data are visualized. defaults to False, because 989 this algorithm can be slow, particularly on large datasets [2]_. See 990 also the `optimal_leaf_ordering` function. 991 992 .. versionadded:: 1.0.0 993 994 Returns 995 ------- 996 Z : ndarray 997 The hierarchical clustering encoded as a linkage matrix. 998 999 Notes 1000 ----- 1001 1. For method 'single', an optimized algorithm based on minimum spanning 1002 tree is implemented. It has time complexity :math:`O(n^2)`. 1003 For methods 'complete', 'average', 'weighted' and 'ward', an algorithm 1004 called nearest-neighbors chain is implemented. It also has time 1005 complexity :math:`O(n^2)`. 1006 For other methods, a naive algorithm is implemented with :math:`O(n^3)` 1007 time complexity. 1008 All algorithms use :math:`O(n^2)` memory. 1009 Refer to [1]_ for details about the algorithms. 1010 2. Methods 'centroid', 'median', and 'ward' are correctly defined only if 1011 Euclidean pairwise metric is used. If `y` is passed as precomputed 1012 pairwise distances, then it is the user's responsibility to assure that 1013 these distances are in fact Euclidean, otherwise the produced result 1014 will be incorrect. 1015 1016 See Also 1017 -------- 1018 scipy.spatial.distance.pdist : pairwise distance metrics 1019 1020 References 1021 ---------- 1022 .. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering 1023 algorithms", :arXiv:`1109.2378v1`. 1024 .. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal 1025 leaf ordering for hierarchical clustering", 2001. Bioinformatics 1026 :doi:`10.1093/bioinformatics/17.suppl_1.S22` 1027 1028 Examples 1029 -------- 1030 >>> from scipy.cluster.hierarchy import dendrogram, linkage 1031 >>> from matplotlib import pyplot as plt 1032 >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]] 1033 1034 >>> Z = linkage(X, 'ward') 1035 >>> fig = plt.figure(figsize=(25, 10)) 1036 >>> dn = dendrogram(Z) 1037 1038 >>> Z = linkage(X, 'single') 1039 >>> fig = plt.figure(figsize=(25, 10)) 1040 >>> dn = dendrogram(Z) 1041 >>> plt.show() 1042 """ 1043 if method not in _LINKAGE_METHODS: 1044 raise ValueError("Invalid method: {0}".format(method)) 1045 1046 y = _convert_to_double(np.asarray(y, order='c')) 1047 1048 if y.ndim == 1: 1049 distance.is_valid_y(y, throw=True, name='y') 1050 [y] = _copy_arrays_if_base_present([y]) 1051 elif y.ndim == 2: 1052 if method in _EUCLIDEAN_METHODS and metric != 'euclidean': 1053 raise ValueError("Method '{0}' requires the distance metric " 1054 "to be Euclidean".format(method)) 1055 if y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0): 1056 if np.all(y >= 0) and np.allclose(y, y.T): 1057 _warning('The symmetric non-negative hollow observation ' 1058 'matrix looks suspiciously like an uncondensed ' 1059 'distance matrix') 1060 y = distance.pdist(y, metric) 1061 else: 1062 raise ValueError("`y` must be 1 or 2 dimensional.") 1063 1064 if not np.all(np.isfinite(y)): 1065 raise ValueError("The condensed distance matrix must contain only " 1066 "finite values.") 1067 1068 n = int(distance.num_obs_y(y)) 1069 method_code = _LINKAGE_METHODS[method] 1070 1071 if method == 'single': 1072 result = _hierarchy.mst_single_linkage(y, n) 1073 elif method in ['complete', 'average', 'weighted', 'ward']: 1074 result = _hierarchy.nn_chain(y, n, method_code) 1075 else: 1076 result = _hierarchy.fast_linkage(y, n, method_code) 1077 1078 if optimal_ordering: 1079 return optimal_leaf_ordering(result, y) 1080 else: 1081 return result 1082 1083 1084class ClusterNode: 1085 """ 1086 A tree node class for representing a cluster. 1087 1088 Leaf nodes correspond to original observations, while non-leaf nodes 1089 correspond to non-singleton clusters. 1090 1091 The `to_tree` function converts a matrix returned by the linkage 1092 function into an easy-to-use tree representation. 1093 1094 All parameter names are also attributes. 1095 1096 Parameters 1097 ---------- 1098 id : int 1099 The node id. 1100 left : ClusterNode instance, optional 1101 The left child tree node. 1102 right : ClusterNode instance, optional 1103 The right child tree node. 1104 dist : float, optional 1105 Distance for this cluster in the linkage matrix. 1106 count : int, optional 1107 The number of samples in this cluster. 1108 1109 See Also 1110 -------- 1111 to_tree : for converting a linkage matrix ``Z`` into a tree object. 1112 1113 """ 1114 1115 def __init__(self, id, left=None, right=None, dist=0, count=1): 1116 if id < 0: 1117 raise ValueError('The id must be non-negative.') 1118 if dist < 0: 1119 raise ValueError('The distance must be non-negative.') 1120 if (left is None and right is not None) or \ 1121 (left is not None and right is None): 1122 raise ValueError('Only full or proper binary trees are permitted.' 1123 ' This node has one child.') 1124 if count < 1: 1125 raise ValueError('A cluster must contain at least one original ' 1126 'observation.') 1127 self.id = id 1128 self.left = left 1129 self.right = right 1130 self.dist = dist 1131 if self.left is None: 1132 self.count = count 1133 else: 1134 self.count = left.count + right.count 1135 1136 def __lt__(self, node): 1137 if not isinstance(node, ClusterNode): 1138 raise ValueError("Can't compare ClusterNode " 1139 "to type {}".format(type(node))) 1140 return self.dist < node.dist 1141 1142 def __gt__(self, node): 1143 if not isinstance(node, ClusterNode): 1144 raise ValueError("Can't compare ClusterNode " 1145 "to type {}".format(type(node))) 1146 return self.dist > node.dist 1147 1148 def __eq__(self, node): 1149 if not isinstance(node, ClusterNode): 1150 raise ValueError("Can't compare ClusterNode " 1151 "to type {}".format(type(node))) 1152 return self.dist == node.dist 1153 1154 def get_id(self): 1155 """ 1156 The identifier of the target node. 1157 1158 For ``0 <= i < n``, `i` corresponds to original observation i. 1159 For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed 1160 at iteration ``i-n``. 1161 1162 Returns 1163 ------- 1164 id : int 1165 The identifier of the target node. 1166 1167 """ 1168 return self.id 1169 1170 def get_count(self): 1171 """ 1172 The number of leaf nodes (original observations) belonging to 1173 the cluster node nd. If the target node is a leaf, 1 is 1174 returned. 1175 1176 Returns 1177 ------- 1178 get_count : int 1179 The number of leaf nodes below the target node. 1180 1181 """ 1182 return self.count 1183 1184 def get_left(self): 1185 """ 1186 Return a reference to the left child tree object. 1187 1188 Returns 1189 ------- 1190 left : ClusterNode 1191 The left child of the target node. If the node is a leaf, 1192 None is returned. 1193 1194 """ 1195 return self.left 1196 1197 def get_right(self): 1198 """ 1199 Return a reference to the right child tree object. 1200 1201 Returns 1202 ------- 1203 right : ClusterNode 1204 The left child of the target node. If the node is a leaf, 1205 None is returned. 1206 1207 """ 1208 return self.right 1209 1210 def is_leaf(self): 1211 """ 1212 Return True if the target node is a leaf. 1213 1214 Returns 1215 ------- 1216 leafness : bool 1217 True if the target node is a leaf node. 1218 1219 """ 1220 return self.left is None 1221 1222 def pre_order(self, func=(lambda x: x.id)): 1223 """ 1224 Perform pre-order traversal without recursive function calls. 1225 1226 When a leaf node is first encountered, ``func`` is called with 1227 the leaf node as its argument, and its result is appended to 1228 the list. 1229 1230 For example, the statement:: 1231 1232 ids = root.pre_order(lambda x: x.id) 1233 1234 returns a list of the node ids corresponding to the leaf nodes 1235 of the tree as they appear from left to right. 1236 1237 Parameters 1238 ---------- 1239 func : function 1240 Applied to each leaf ClusterNode object in the pre-order traversal. 1241 Given the ``i``-th leaf node in the pre-order traversal ``n[i]``, 1242 the result of ``func(n[i])`` is stored in ``L[i]``. If not 1243 provided, the index of the original observation to which the node 1244 corresponds is used. 1245 1246 Returns 1247 ------- 1248 L : list 1249 The pre-order traversal. 1250 1251 """ 1252 # Do a preorder traversal, caching the result. To avoid having to do 1253 # recursion, we'll store the previous index we've visited in a vector. 1254 n = self.count 1255 1256 curNode = [None] * (2 * n) 1257 lvisited = set() 1258 rvisited = set() 1259 curNode[0] = self 1260 k = 0 1261 preorder = [] 1262 while k >= 0: 1263 nd = curNode[k] 1264 ndid = nd.id 1265 if nd.is_leaf(): 1266 preorder.append(func(nd)) 1267 k = k - 1 1268 else: 1269 if ndid not in lvisited: 1270 curNode[k + 1] = nd.left 1271 lvisited.add(ndid) 1272 k = k + 1 1273 elif ndid not in rvisited: 1274 curNode[k + 1] = nd.right 1275 rvisited.add(ndid) 1276 k = k + 1 1277 # If we've visited the left and right of this non-leaf 1278 # node already, go up in the tree. 1279 else: 1280 k = k - 1 1281 1282 return preorder 1283 1284 1285_cnode_bare = ClusterNode(0) 1286_cnode_type = type(ClusterNode) 1287 1288 1289def _order_cluster_tree(Z): 1290 """ 1291 Return clustering nodes in bottom-up order by distance. 1292 1293 Parameters 1294 ---------- 1295 Z : scipy.cluster.linkage array 1296 The linkage matrix. 1297 1298 Returns 1299 ------- 1300 nodes : list 1301 A list of ClusterNode objects. 1302 """ 1303 q = deque() 1304 tree = to_tree(Z) 1305 q.append(tree) 1306 nodes = [] 1307 1308 while q: 1309 node = q.popleft() 1310 if not node.is_leaf(): 1311 bisect.insort_left(nodes, node) 1312 q.append(node.get_right()) 1313 q.append(node.get_left()) 1314 return nodes 1315 1316 1317def cut_tree(Z, n_clusters=None, height=None): 1318 """ 1319 Given a linkage matrix Z, return the cut tree. 1320 1321 Parameters 1322 ---------- 1323 Z : scipy.cluster.linkage array 1324 The linkage matrix. 1325 n_clusters : array_like, optional 1326 Number of clusters in the tree at the cut point. 1327 height : array_like, optional 1328 The height at which to cut the tree. Only possible for ultrametric 1329 trees. 1330 1331 Returns 1332 ------- 1333 cutree : array 1334 An array indicating group membership at each agglomeration step. I.e., 1335 for a full cut tree, in the first column each data point is in its own 1336 cluster. At the next step, two nodes are merged. Finally, all 1337 singleton and non-singleton clusters are in one group. If `n_clusters` 1338 or `height` are given, the columns correspond to the columns of 1339 `n_clusters` or `height`. 1340 1341 Examples 1342 -------- 1343 >>> from scipy import cluster 1344 >>> import numpy as np 1345 >>> from numpy.random import default_rng 1346 >>> rng = default_rng() 1347 >>> X = rng.random((50, 4)) 1348 >>> Z = cluster.hierarchy.ward(X) 1349 >>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10]) 1350 >>> cutree[:10] 1351 array([[0, 0], 1352 [1, 1], 1353 [2, 2], 1354 [3, 3], 1355 [3, 4], 1356 [2, 2], 1357 [0, 0], 1358 [1, 5], 1359 [3, 6], 1360 [4, 7]]) # random 1361 1362 """ 1363 nobs = num_obs_linkage(Z) 1364 nodes = _order_cluster_tree(Z) 1365 1366 if height is not None and n_clusters is not None: 1367 raise ValueError("At least one of either height or n_clusters " 1368 "must be None") 1369 elif height is None and n_clusters is None: # return the full cut tree 1370 cols_idx = np.arange(nobs) 1371 elif height is not None: 1372 heights = np.array([x.dist for x in nodes]) 1373 cols_idx = np.searchsorted(heights, height) 1374 else: 1375 cols_idx = nobs - np.searchsorted(np.arange(nobs), n_clusters) 1376 1377 try: 1378 n_cols = len(cols_idx) 1379 except TypeError: # scalar 1380 n_cols = 1 1381 cols_idx = np.array([cols_idx]) 1382 1383 groups = np.zeros((n_cols, nobs), dtype=int) 1384 last_group = np.arange(nobs) 1385 if 0 in cols_idx: 1386 groups[0] = last_group 1387 1388 for i, node in enumerate(nodes): 1389 idx = node.pre_order() 1390 this_group = last_group.copy() 1391 this_group[idx] = last_group[idx].min() 1392 this_group[this_group > last_group[idx].max()] -= 1 1393 if i + 1 in cols_idx: 1394 groups[np.nonzero(i + 1 == cols_idx)[0]] = this_group 1395 last_group = this_group 1396 1397 return groups.T 1398 1399 1400def to_tree(Z, rd=False): 1401 """ 1402 Convert a linkage matrix into an easy-to-use tree object. 1403 1404 The reference to the root `ClusterNode` object is returned (by default). 1405 1406 Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``, 1407 and ``count`` attribute. The left and right attributes point to 1408 ClusterNode objects that were combined to generate the cluster. 1409 If both are None then the `ClusterNode` object is a leaf node, its count 1410 must be 1, and its distance is meaningless but set to 0. 1411 1412 *Note: This function is provided for the convenience of the library 1413 user. ClusterNodes are not used as input to any of the functions in this 1414 library.* 1415 1416 Parameters 1417 ---------- 1418 Z : ndarray 1419 The linkage matrix in proper form (see the `linkage` 1420 function documentation). 1421 rd : bool, optional 1422 When False (default), a reference to the root `ClusterNode` object is 1423 returned. Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a 1424 reference to the root node while ``d`` is a list of `ClusterNode` 1425 objects - one per original entry in the linkage matrix plus entries 1426 for all clustering steps. If a cluster id is 1427 less than the number of samples ``n`` in the data that the linkage 1428 matrix describes, then it corresponds to a singleton cluster (leaf 1429 node). 1430 See `linkage` for more information on the assignment of cluster ids 1431 to clusters. 1432 1433 Returns 1434 ------- 1435 tree : ClusterNode or tuple (ClusterNode, list of ClusterNode) 1436 If ``rd`` is False, a `ClusterNode`. 1437 If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number 1438 of samples. See the description of `rd` above for more details. 1439 1440 See Also 1441 -------- 1442 linkage, is_valid_linkage, ClusterNode 1443 1444 Examples 1445 -------- 1446 >>> from scipy.cluster import hierarchy 1447 >>> rng = np.random.default_rng() 1448 >>> x = rng.random((5, 2)) 1449 >>> Z = hierarchy.linkage(x) 1450 >>> hierarchy.to_tree(Z) 1451 <scipy.cluster.hierarchy.ClusterNode object at ... 1452 >>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True) 1453 >>> rootnode 1454 <scipy.cluster.hierarchy.ClusterNode object at ... 1455 >>> len(nodelist) 1456 9 1457 1458 """ 1459 Z = np.asarray(Z, order='c') 1460 is_valid_linkage(Z, throw=True, name='Z') 1461 1462 # Number of original objects is equal to the number of rows plus 1. 1463 n = Z.shape[0] + 1 1464 1465 # Create a list full of None's to store the node objects 1466 d = [None] * (n * 2 - 1) 1467 1468 # Create the nodes corresponding to the n original objects. 1469 for i in range(0, n): 1470 d[i] = ClusterNode(i) 1471 1472 nd = None 1473 1474 for i, row in enumerate(Z): 1475 fi = int(row[0]) 1476 fj = int(row[1]) 1477 if fi > i + n: 1478 raise ValueError(('Corrupt matrix Z. Index to derivative cluster ' 1479 'is used before it is formed. See row %d, ' 1480 'column 0') % fi) 1481 if fj > i + n: 1482 raise ValueError(('Corrupt matrix Z. Index to derivative cluster ' 1483 'is used before it is formed. See row %d, ' 1484 'column 1') % fj) 1485 1486 nd = ClusterNode(i + n, d[fi], d[fj], row[2]) 1487 # ^ id ^ left ^ right ^ dist 1488 if row[3] != nd.count: 1489 raise ValueError(('Corrupt matrix Z. The count Z[%d,3] is ' 1490 'incorrect.') % i) 1491 d[n + i] = nd 1492 1493 if rd: 1494 return (nd, d) 1495 else: 1496 return nd 1497 1498 1499def optimal_leaf_ordering(Z, y, metric='euclidean'): 1500 """ 1501 Given a linkage matrix Z and distance, reorder the cut tree. 1502 1503 Parameters 1504 ---------- 1505 Z : ndarray 1506 The hierarchical clustering encoded as a linkage matrix. See 1507 `linkage` for more information on the return structure and 1508 algorithm. 1509 y : ndarray 1510 The condensed distance matrix from which Z was generated. 1511 Alternatively, a collection of m observation vectors in n 1512 dimensions may be passed as an m by n array. 1513 metric : str or function, optional 1514 The distance metric to use in the case that y is a collection of 1515 observation vectors; ignored otherwise. See the ``pdist`` 1516 function for a list of valid distance metrics. A custom distance 1517 function can also be used. 1518 1519 Returns 1520 ------- 1521 Z_ordered : ndarray 1522 A copy of the linkage matrix Z, reordered to minimize the distance 1523 between adjacent leaves. 1524 1525 Examples 1526 -------- 1527 >>> from scipy.cluster import hierarchy 1528 >>> rng = np.random.default_rng() 1529 >>> X = rng.standard_normal((10, 10)) 1530 >>> Z = hierarchy.ward(X) 1531 >>> hierarchy.leaves_list(Z) 1532 array([0, 3, 1, 9, 2, 5, 7, 4, 6, 8], dtype=int32) 1533 >>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X)) 1534 array([3, 0, 2, 5, 7, 4, 8, 6, 9, 1], dtype=int32) 1535 1536 """ 1537 Z = np.asarray(Z, order='c') 1538 is_valid_linkage(Z, throw=True, name='Z') 1539 1540 y = _convert_to_double(np.asarray(y, order='c')) 1541 1542 if y.ndim == 1: 1543 distance.is_valid_y(y, throw=True, name='y') 1544 [y] = _copy_arrays_if_base_present([y]) 1545 elif y.ndim == 2: 1546 if y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0): 1547 if np.all(y >= 0) and np.allclose(y, y.T): 1548 _warning('The symmetric non-negative hollow observation ' 1549 'matrix looks suspiciously like an uncondensed ' 1550 'distance matrix') 1551 y = distance.pdist(y, metric) 1552 else: 1553 raise ValueError("`y` must be 1 or 2 dimensional.") 1554 1555 if not np.all(np.isfinite(y)): 1556 raise ValueError("The condensed distance matrix must contain only " 1557 "finite values.") 1558 1559 return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y) 1560 1561 1562def _convert_to_bool(X): 1563 if X.dtype != bool: 1564 X = X.astype(bool) 1565 if not X.flags.contiguous: 1566 X = X.copy() 1567 return X 1568 1569 1570def _convert_to_double(X): 1571 if X.dtype != np.double: 1572 X = X.astype(np.double) 1573 if not X.flags.contiguous: 1574 X = X.copy() 1575 return X 1576 1577 1578def cophenet(Z, Y=None): 1579 """ 1580 Calculate the cophenetic distances between each observation in 1581 the hierarchical clustering defined by the linkage ``Z``. 1582 1583 Suppose ``p`` and ``q`` are original observations in 1584 disjoint clusters ``s`` and ``t``, respectively and 1585 ``s`` and ``t`` are joined by a direct parent cluster 1586 ``u``. The cophenetic distance between observations 1587 ``i`` and ``j`` is simply the distance between 1588 clusters ``s`` and ``t``. 1589 1590 Parameters 1591 ---------- 1592 Z : ndarray 1593 The hierarchical clustering encoded as an array 1594 (see `linkage` function). 1595 Y : ndarray (optional) 1596 Calculates the cophenetic correlation coefficient ``c`` of a 1597 hierarchical clustering defined by the linkage matrix `Z` 1598 of a set of :math:`n` observations in :math:`m` 1599 dimensions. `Y` is the condensed distance matrix from which 1600 `Z` was generated. 1601 1602 Returns 1603 ------- 1604 c : ndarray 1605 The cophentic correlation distance (if ``Y`` is passed). 1606 d : ndarray 1607 The cophenetic distance matrix in condensed form. The 1608 :math:`ij` th entry is the cophenetic distance between 1609 original observations :math:`i` and :math:`j`. 1610 1611 See Also 1612 -------- 1613 linkage : for a description of what a linkage matrix is. 1614 scipy.spatial.distance.squareform : transforming condensed matrices into square ones. 1615 1616 Examples 1617 -------- 1618 >>> from scipy.cluster.hierarchy import single, cophenet 1619 >>> from scipy.spatial.distance import pdist, squareform 1620 1621 Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance 1622 between two points of ``X`` is the distance between the largest two 1623 distinct clusters that each of the points: 1624 1625 >>> X = [[0, 0], [0, 1], [1, 0], 1626 ... [0, 4], [0, 3], [1, 4], 1627 ... [4, 0], [3, 0], [4, 1], 1628 ... [4, 4], [3, 4], [4, 3]] 1629 1630 ``X`` corresponds to this dataset :: 1631 1632 x x x x 1633 x x 1634 1635 x x 1636 x x x x 1637 1638 >>> Z = single(pdist(X)) 1639 >>> Z 1640 array([[ 0., 1., 1., 2.], 1641 [ 2., 12., 1., 3.], 1642 [ 3., 4., 1., 2.], 1643 [ 5., 14., 1., 3.], 1644 [ 6., 7., 1., 2.], 1645 [ 8., 16., 1., 3.], 1646 [ 9., 10., 1., 2.], 1647 [11., 18., 1., 3.], 1648 [13., 15., 2., 6.], 1649 [17., 20., 2., 9.], 1650 [19., 21., 2., 12.]]) 1651 >>> cophenet(Z) 1652 array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 1653 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2., 1654 2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1655 1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.]) 1656 1657 The output of the `scipy.cluster.hierarchy.cophenet` method is 1658 represented in condensed form. We can use 1659 `scipy.spatial.distance.squareform` to see the output as a 1660 regular matrix (where each element ``ij`` denotes the cophenetic distance 1661 between each ``i``, ``j`` pair of points in ``X``): 1662 1663 >>> squareform(cophenet(Z)) 1664 array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.], 1665 [1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.], 1666 [1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.], 1667 [2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.], 1668 [2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.], 1669 [2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.], 1670 [2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.], 1671 [2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.], 1672 [2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.], 1673 [2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.], 1674 [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.], 1675 [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]]) 1676 1677 In this example, the cophenetic distance between points on ``X`` that are 1678 very close (i.e., in the same corner) is 1. For other pairs of points is 2, 1679 because the points will be located in clusters at different 1680 corners - thus, the distance between these clusters will be larger. 1681 1682 """ 1683 Z = np.asarray(Z, order='c') 1684 is_valid_linkage(Z, throw=True, name='Z') 1685 Zs = Z.shape 1686 n = Zs[0] + 1 1687 1688 zz = np.zeros((n * (n-1)) // 2, dtype=np.double) 1689 # Since the C code does not support striding using strides. 1690 # The dimensions are used instead. 1691 Z = _convert_to_double(Z) 1692 1693 _hierarchy.cophenetic_distances(Z, zz, int(n)) 1694 if Y is None: 1695 return zz 1696 1697 Y = np.asarray(Y, order='c') 1698 distance.is_valid_y(Y, throw=True, name='Y') 1699 1700 z = zz.mean() 1701 y = Y.mean() 1702 Yy = Y - y 1703 Zz = zz - z 1704 numerator = (Yy * Zz) 1705 denomA = Yy**2 1706 denomB = Zz**2 1707 c = numerator.sum() / np.sqrt((denomA.sum() * denomB.sum())) 1708 return (c, zz) 1709 1710 1711def inconsistent(Z, d=2): 1712 r""" 1713 Calculate inconsistency statistics on a linkage matrix. 1714 1715 Parameters 1716 ---------- 1717 Z : ndarray 1718 The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical 1719 clustering). See `linkage` documentation for more information on its 1720 form. 1721 d : int, optional 1722 The number of links up to `d` levels below each non-singleton cluster. 1723 1724 Returns 1725 ------- 1726 R : ndarray 1727 A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link 1728 statistics for the non-singleton cluster ``i``. The link statistics are 1729 computed over the link heights for links :math:`d` levels below the 1730 cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard 1731 deviation of the link heights, respectively; ``R[i,2]`` is the number 1732 of links included in the calculation; and ``R[i,3]`` is the 1733 inconsistency coefficient, 1734 1735 .. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]} 1736 1737 Notes 1738 ----- 1739 This function behaves similarly to the MATLAB(TM) ``inconsistent`` 1740 function. 1741 1742 Examples 1743 -------- 1744 >>> from scipy.cluster.hierarchy import inconsistent, linkage 1745 >>> from matplotlib import pyplot as plt 1746 >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]] 1747 >>> Z = linkage(X, 'ward') 1748 >>> print(Z) 1749 [[ 5. 6. 0. 2. ] 1750 [ 2. 7. 0. 2. ] 1751 [ 0. 4. 1. 2. ] 1752 [ 1. 8. 1.15470054 3. ] 1753 [ 9. 10. 2.12132034 4. ] 1754 [ 3. 12. 4.11096096 5. ] 1755 [11. 13. 14.07183949 8. ]] 1756 >>> inconsistent(Z) 1757 array([[ 0. , 0. , 1. , 0. ], 1758 [ 0. , 0. , 1. , 0. ], 1759 [ 1. , 0. , 1. , 0. ], 1760 [ 0.57735027, 0.81649658, 2. , 0.70710678], 1761 [ 1.04044011, 1.06123822, 3. , 1.01850858], 1762 [ 3.11614065, 1.40688837, 2. , 0.70710678], 1763 [ 6.44583366, 6.76770586, 3. , 1.12682288]]) 1764 1765 """ 1766 Z = np.asarray(Z, order='c') 1767 1768 Zs = Z.shape 1769 is_valid_linkage(Z, throw=True, name='Z') 1770 if (not d == np.floor(d)) or d < 0: 1771 raise ValueError('The second argument d must be a nonnegative ' 1772 'integer value.') 1773 1774 # Since the C code does not support striding using strides. 1775 # The dimensions are used instead. 1776 [Z] = _copy_arrays_if_base_present([Z]) 1777 1778 n = Zs[0] + 1 1779 R = np.zeros((n - 1, 4), dtype=np.double) 1780 1781 _hierarchy.inconsistent(Z, R, int(n), int(d)) 1782 return R 1783 1784 1785def from_mlab_linkage(Z): 1786 """ 1787 Convert a linkage matrix generated by MATLAB(TM) to a new 1788 linkage matrix compatible with this module. 1789 1790 The conversion does two things: 1791 1792 * the indices are converted from ``1..N`` to ``0..(N-1)`` form, 1793 and 1794 1795 * a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the 1796 number of original observations (leaves) in the non-singleton 1797 cluster ``i``. 1798 1799 This function is useful when loading in linkages from legacy data 1800 files generated by MATLAB. 1801 1802 Parameters 1803 ---------- 1804 Z : ndarray 1805 A linkage matrix generated by MATLAB(TM). 1806 1807 Returns 1808 ------- 1809 ZS : ndarray 1810 A linkage matrix compatible with ``scipy.cluster.hierarchy``. 1811 1812 See Also 1813 -------- 1814 linkage : for a description of what a linkage matrix is. 1815 to_mlab_linkage : transform from SciPy to MATLAB format. 1816 1817 Examples 1818 -------- 1819 >>> import numpy as np 1820 >>> from scipy.cluster.hierarchy import ward, from_mlab_linkage 1821 1822 Given a linkage matrix in MATLAB format ``mZ``, we can use 1823 `scipy.cluster.hierarchy.from_mlab_linkage` to import 1824 it into SciPy format: 1825 1826 >>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1], 1827 ... [10, 11, 1], [3, 13, 1.29099445], 1828 ... [6, 14, 1.29099445], 1829 ... [9, 15, 1.29099445], 1830 ... [12, 16, 1.29099445], 1831 ... [17, 18, 5.77350269], 1832 ... [19, 20, 5.77350269], 1833 ... [21, 22, 8.16496581]]) 1834 1835 >>> Z = from_mlab_linkage(mZ) 1836 >>> Z 1837 array([[ 0. , 1. , 1. , 2. ], 1838 [ 3. , 4. , 1. , 2. ], 1839 [ 6. , 7. , 1. , 2. ], 1840 [ 9. , 10. , 1. , 2. ], 1841 [ 2. , 12. , 1.29099445, 3. ], 1842 [ 5. , 13. , 1.29099445, 3. ], 1843 [ 8. , 14. , 1.29099445, 3. ], 1844 [ 11. , 15. , 1.29099445, 3. ], 1845 [ 16. , 17. , 5.77350269, 6. ], 1846 [ 18. , 19. , 5.77350269, 6. ], 1847 [ 20. , 21. , 8.16496581, 12. ]]) 1848 1849 As expected, the linkage matrix ``Z`` returned includes an 1850 additional column counting the number of original samples in 1851 each cluster. Also, all cluster indices are reduced by 1 1852 (MATLAB format uses 1-indexing, whereas SciPy uses 0-indexing). 1853 1854 """ 1855 Z = np.asarray(Z, dtype=np.double, order='c') 1856 Zs = Z.shape 1857 1858 # If it's empty, return it. 1859 if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0): 1860 return Z.copy() 1861 1862 if len(Zs) != 2: 1863 raise ValueError("The linkage array must be rectangular.") 1864 1865 # If it contains no rows, return it. 1866 if Zs[0] == 0: 1867 return Z.copy() 1868 1869 Zpart = Z.copy() 1870 if Zpart[:, 0:2].min() != 1.0 and Zpart[:, 0:2].max() != 2 * Zs[0]: 1871 raise ValueError('The format of the indices is not 1..N') 1872 1873 Zpart[:, 0:2] -= 1.0 1874 CS = np.zeros((Zs[0],), dtype=np.double) 1875 _hierarchy.calculate_cluster_sizes(Zpart, CS, int(Zs[0]) + 1) 1876 return np.hstack([Zpart, CS.reshape(Zs[0], 1)]) 1877 1878 1879def to_mlab_linkage(Z): 1880 """ 1881 Convert a linkage matrix to a MATLAB(TM) compatible one. 1882 1883 Converts a linkage matrix ``Z`` generated by the linkage function 1884 of this module to a MATLAB(TM) compatible one. The return linkage 1885 matrix has the last column removed and the cluster indices are 1886 converted to ``1..N`` indexing. 1887 1888 Parameters 1889 ---------- 1890 Z : ndarray 1891 A linkage matrix generated by ``scipy.cluster.hierarchy``. 1892 1893 Returns 1894 ------- 1895 to_mlab_linkage : ndarray 1896 A linkage matrix compatible with MATLAB(TM)'s hierarchical 1897 clustering functions. 1898 1899 The return linkage matrix has the last column removed 1900 and the cluster indices are converted to ``1..N`` indexing. 1901 1902 See Also 1903 -------- 1904 linkage : for a description of what a linkage matrix is. 1905 from_mlab_linkage : transform from Matlab to SciPy format. 1906 1907 Examples 1908 -------- 1909 >>> from scipy.cluster.hierarchy import ward, to_mlab_linkage 1910 >>> from scipy.spatial.distance import pdist 1911 1912 >>> X = [[0, 0], [0, 1], [1, 0], 1913 ... [0, 4], [0, 3], [1, 4], 1914 ... [4, 0], [3, 0], [4, 1], 1915 ... [4, 4], [3, 4], [4, 3]] 1916 1917 >>> Z = ward(pdist(X)) 1918 >>> Z 1919 array([[ 0. , 1. , 1. , 2. ], 1920 [ 3. , 4. , 1. , 2. ], 1921 [ 6. , 7. , 1. , 2. ], 1922 [ 9. , 10. , 1. , 2. ], 1923 [ 2. , 12. , 1.29099445, 3. ], 1924 [ 5. , 13. , 1.29099445, 3. ], 1925 [ 8. , 14. , 1.29099445, 3. ], 1926 [11. , 15. , 1.29099445, 3. ], 1927 [16. , 17. , 5.77350269, 6. ], 1928 [18. , 19. , 5.77350269, 6. ], 1929 [20. , 21. , 8.16496581, 12. ]]) 1930 1931 After a linkage matrix ``Z`` has been created, we can use 1932 `scipy.cluster.hierarchy.to_mlab_linkage` to convert it 1933 into MATLAB format: 1934 1935 >>> mZ = to_mlab_linkage(Z) 1936 >>> mZ 1937 array([[ 1. , 2. , 1. ], 1938 [ 4. , 5. , 1. ], 1939 [ 7. , 8. , 1. ], 1940 [ 10. , 11. , 1. ], 1941 [ 3. , 13. , 1.29099445], 1942 [ 6. , 14. , 1.29099445], 1943 [ 9. , 15. , 1.29099445], 1944 [ 12. , 16. , 1.29099445], 1945 [ 17. , 18. , 5.77350269], 1946 [ 19. , 20. , 5.77350269], 1947 [ 21. , 22. , 8.16496581]]) 1948 1949 The new linkage matrix ``mZ`` uses 1-indexing for all the 1950 clusters (instead of 0-indexing). Also, the last column of 1951 the original linkage matrix has been dropped. 1952 1953 """ 1954 Z = np.asarray(Z, order='c', dtype=np.double) 1955 Zs = Z.shape 1956 if len(Zs) == 0 or (len(Zs) == 1 and Zs[0] == 0): 1957 return Z.copy() 1958 is_valid_linkage(Z, throw=True, name='Z') 1959 1960 ZP = Z[:, 0:3].copy() 1961 ZP[:, 0:2] += 1.0 1962 1963 return ZP 1964 1965 1966def is_monotonic(Z): 1967 """ 1968 Return True if the linkage passed is monotonic. 1969 1970 The linkage is monotonic if for every cluster :math:`s` and :math:`t` 1971 joined, the distance between them is no less than the distance 1972 between any previously joined clusters. 1973 1974 Parameters 1975 ---------- 1976 Z : ndarray 1977 The linkage matrix to check for monotonicity. 1978 1979 Returns 1980 ------- 1981 b : bool 1982 A boolean indicating whether the linkage is monotonic. 1983 1984 See Also 1985 -------- 1986 linkage : for a description of what a linkage matrix is. 1987 1988 Examples 1989 -------- 1990 >>> from scipy.cluster.hierarchy import median, ward, is_monotonic 1991 >>> from scipy.spatial.distance import pdist 1992 1993 By definition, some hierarchical clustering algorithms - such as 1994 `scipy.cluster.hierarchy.ward` - produce monotonic assignments of 1995 samples to clusters; however, this is not always true for other 1996 hierarchical methods - e.g. `scipy.cluster.hierarchy.median`. 1997 1998 Given a linkage matrix ``Z`` (as the result of a hierarchical clustering 1999 method) we can test programmatically whether it has the monotonicity 2000 property or not, using `scipy.cluster.hierarchy.is_monotonic`: 2001 2002 >>> X = [[0, 0], [0, 1], [1, 0], 2003 ... [0, 4], [0, 3], [1, 4], 2004 ... [4, 0], [3, 0], [4, 1], 2005 ... [4, 4], [3, 4], [4, 3]] 2006 2007 >>> Z = ward(pdist(X)) 2008 >>> Z 2009 array([[ 0. , 1. , 1. , 2. ], 2010 [ 3. , 4. , 1. , 2. ], 2011 [ 6. , 7. , 1. , 2. ], 2012 [ 9. , 10. , 1. , 2. ], 2013 [ 2. , 12. , 1.29099445, 3. ], 2014 [ 5. , 13. , 1.29099445, 3. ], 2015 [ 8. , 14. , 1.29099445, 3. ], 2016 [11. , 15. , 1.29099445, 3. ], 2017 [16. , 17. , 5.77350269, 6. ], 2018 [18. , 19. , 5.77350269, 6. ], 2019 [20. , 21. , 8.16496581, 12. ]]) 2020 >>> is_monotonic(Z) 2021 True 2022 2023 >>> Z = median(pdist(X)) 2024 >>> Z 2025 array([[ 0. , 1. , 1. , 2. ], 2026 [ 3. , 4. , 1. , 2. ], 2027 [ 9. , 10. , 1. , 2. ], 2028 [ 6. , 7. , 1. , 2. ], 2029 [ 2. , 12. , 1.11803399, 3. ], 2030 [ 5. , 13. , 1.11803399, 3. ], 2031 [ 8. , 15. , 1.11803399, 3. ], 2032 [11. , 14. , 1.11803399, 3. ], 2033 [18. , 19. , 3. , 6. ], 2034 [16. , 17. , 3.5 , 6. ], 2035 [20. , 21. , 3.25 , 12. ]]) 2036 >>> is_monotonic(Z) 2037 False 2038 2039 Note that this method is equivalent to just verifying that the distances 2040 in the third column of the linkage matrix appear in a monotonically 2041 increasing order. 2042 2043 """ 2044 Z = np.asarray(Z, order='c') 2045 is_valid_linkage(Z, throw=True, name='Z') 2046 2047 # We expect the i'th value to be greater than its successor. 2048 return (Z[1:, 2] >= Z[:-1, 2]).all() 2049 2050 2051def is_valid_im(R, warning=False, throw=False, name=None): 2052 """Return True if the inconsistency matrix passed is valid. 2053 2054 It must be a :math:`n` by 4 array of doubles. The standard 2055 deviations ``R[:,1]`` must be nonnegative. The link counts 2056 ``R[:,2]`` must be positive and no greater than :math:`n-1`. 2057 2058 Parameters 2059 ---------- 2060 R : ndarray 2061 The inconsistency matrix to check for validity. 2062 warning : bool, optional 2063 When True, issues a Python warning if the linkage 2064 matrix passed is invalid. 2065 throw : bool, optional 2066 When True, throws a Python exception if the linkage 2067 matrix passed is invalid. 2068 name : str, optional 2069 This string refers to the variable name of the invalid 2070 linkage matrix. 2071 2072 Returns 2073 ------- 2074 b : bool 2075 True if the inconsistency matrix is valid. 2076 2077 See Also 2078 -------- 2079 linkage : for a description of what a linkage matrix is. 2080 inconsistent : for the creation of a inconsistency matrix. 2081 2082 Examples 2083 -------- 2084 >>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im 2085 >>> from scipy.spatial.distance import pdist 2086 2087 Given a data set ``X``, we can apply a clustering method to obtain a 2088 linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can 2089 be also used to obtain the inconsistency matrix ``R`` associated to 2090 this clustering process: 2091 2092 >>> X = [[0, 0], [0, 1], [1, 0], 2093 ... [0, 4], [0, 3], [1, 4], 2094 ... [4, 0], [3, 0], [4, 1], 2095 ... [4, 4], [3, 4], [4, 3]] 2096 2097 >>> Z = ward(pdist(X)) 2098 >>> R = inconsistent(Z) 2099 >>> Z 2100 array([[ 0. , 1. , 1. , 2. ], 2101 [ 3. , 4. , 1. , 2. ], 2102 [ 6. , 7. , 1. , 2. ], 2103 [ 9. , 10. , 1. , 2. ], 2104 [ 2. , 12. , 1.29099445, 3. ], 2105 [ 5. , 13. , 1.29099445, 3. ], 2106 [ 8. , 14. , 1.29099445, 3. ], 2107 [11. , 15. , 1.29099445, 3. ], 2108 [16. , 17. , 5.77350269, 6. ], 2109 [18. , 19. , 5.77350269, 6. ], 2110 [20. , 21. , 8.16496581, 12. ]]) 2111 >>> R 2112 array([[1. , 0. , 1. , 0. ], 2113 [1. , 0. , 1. , 0. ], 2114 [1. , 0. , 1. , 0. ], 2115 [1. , 0. , 1. , 0. ], 2116 [1.14549722, 0.20576415, 2. , 0.70710678], 2117 [1.14549722, 0.20576415, 2. , 0.70710678], 2118 [1.14549722, 0.20576415, 2. , 0.70710678], 2119 [1.14549722, 0.20576415, 2. , 0.70710678], 2120 [2.78516386, 2.58797734, 3. , 1.15470054], 2121 [2.78516386, 2.58797734, 3. , 1.15470054], 2122 [6.57065706, 1.38071187, 3. , 1.15470054]]) 2123 2124 Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that 2125 ``R`` is correct: 2126 2127 >>> is_valid_im(R) 2128 True 2129 2130 However, if ``R`` is wrongly constructed (e.g., one of the standard 2131 deviations is set to a negative value), then the check will fail: 2132 2133 >>> R[-1,1] = R[-1,1] * -1 2134 >>> is_valid_im(R) 2135 False 2136 2137 """ 2138 R = np.asarray(R, order='c') 2139 valid = True 2140 name_str = "%r " % name if name else '' 2141 try: 2142 if type(R) != np.ndarray: 2143 raise TypeError('Variable %spassed as inconsistency matrix is not ' 2144 'a numpy array.' % name_str) 2145 if R.dtype != np.double: 2146 raise TypeError('Inconsistency matrix %smust contain doubles ' 2147 '(double).' % name_str) 2148 if len(R.shape) != 2: 2149 raise ValueError('Inconsistency matrix %smust have shape=2 (i.e. ' 2150 'be two-dimensional).' % name_str) 2151 if R.shape[1] != 4: 2152 raise ValueError('Inconsistency matrix %smust have 4 columns.' % 2153 name_str) 2154 if R.shape[0] < 1: 2155 raise ValueError('Inconsistency matrix %smust have at least one ' 2156 'row.' % name_str) 2157 if (R[:, 0] < 0).any(): 2158 raise ValueError('Inconsistency matrix %scontains negative link ' 2159 'height means.' % name_str) 2160 if (R[:, 1] < 0).any(): 2161 raise ValueError('Inconsistency matrix %scontains negative link ' 2162 'height standard deviations.' % name_str) 2163 if (R[:, 2] < 0).any(): 2164 raise ValueError('Inconsistency matrix %scontains negative link ' 2165 'counts.' % name_str) 2166 except Exception as e: 2167 if throw: 2168 raise 2169 if warning: 2170 _warning(str(e)) 2171 valid = False 2172 2173 return valid 2174 2175 2176def is_valid_linkage(Z, warning=False, throw=False, name=None): 2177 """ 2178 Check the validity of a linkage matrix. 2179 2180 A linkage matrix is valid if it is a 2-D array (type double) 2181 with :math:`n` rows and 4 columns. The first two columns must contain 2182 indices between 0 and :math:`2n-1`. For a given row ``i``, the following 2183 two expressions have to hold: 2184 2185 .. math:: 2186 2187 0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1 2188 0 \\leq Z[i,1] \\leq i+n-1 2189 2190 I.e., a cluster cannot join another cluster unless the cluster being joined 2191 has been generated. 2192 2193 Parameters 2194 ---------- 2195 Z : array_like 2196 Linkage matrix. 2197 warning : bool, optional 2198 When True, issues a Python warning if the linkage 2199 matrix passed is invalid. 2200 throw : bool, optional 2201 When True, throws a Python exception if the linkage 2202 matrix passed is invalid. 2203 name : str, optional 2204 This string refers to the variable name of the invalid 2205 linkage matrix. 2206 2207 Returns 2208 ------- 2209 b : bool 2210 True if the inconsistency matrix is valid. 2211 2212 See Also 2213 -------- 2214 linkage: for a description of what a linkage matrix is. 2215 2216 Examples 2217 -------- 2218 >>> from scipy.cluster.hierarchy import ward, is_valid_linkage 2219 >>> from scipy.spatial.distance import pdist 2220 2221 All linkage matrices generated by the clustering methods in this module 2222 will be valid (i.e., they will have the appropriate dimensions and the two 2223 required expressions will hold for all the rows). 2224 2225 We can check this using `scipy.cluster.hierarchy.is_valid_linkage`: 2226 2227 >>> X = [[0, 0], [0, 1], [1, 0], 2228 ... [0, 4], [0, 3], [1, 4], 2229 ... [4, 0], [3, 0], [4, 1], 2230 ... [4, 4], [3, 4], [4, 3]] 2231 2232 >>> Z = ward(pdist(X)) 2233 >>> Z 2234 array([[ 0. , 1. , 1. , 2. ], 2235 [ 3. , 4. , 1. , 2. ], 2236 [ 6. , 7. , 1. , 2. ], 2237 [ 9. , 10. , 1. , 2. ], 2238 [ 2. , 12. , 1.29099445, 3. ], 2239 [ 5. , 13. , 1.29099445, 3. ], 2240 [ 8. , 14. , 1.29099445, 3. ], 2241 [11. , 15. , 1.29099445, 3. ], 2242 [16. , 17. , 5.77350269, 6. ], 2243 [18. , 19. , 5.77350269, 6. ], 2244 [20. , 21. , 8.16496581, 12. ]]) 2245 >>> is_valid_linkage(Z) 2246 True 2247 2248 However, if we create a linkage matrix in a wrong way - or if we modify 2249 a valid one in a way that any of the required expressions don't hold 2250 anymore, then the check will fail: 2251 2252 >>> Z[3][1] = 20 # the cluster number 20 is not defined at this point 2253 >>> is_valid_linkage(Z) 2254 False 2255 2256 """ 2257 Z = np.asarray(Z, order='c') 2258 valid = True 2259 name_str = "%r " % name if name else '' 2260 try: 2261 if type(Z) != np.ndarray: 2262 raise TypeError('Passed linkage argument %sis not a valid array.' % 2263 name_str) 2264 if Z.dtype != np.double: 2265 raise TypeError('Linkage matrix %smust contain doubles.' % name_str) 2266 if len(Z.shape) != 2: 2267 raise ValueError('Linkage matrix %smust have shape=2 (i.e. be ' 2268 'two-dimensional).' % name_str) 2269 if Z.shape[1] != 4: 2270 raise ValueError('Linkage matrix %smust have 4 columns.' % name_str) 2271 if Z.shape[0] == 0: 2272 raise ValueError('Linkage must be computed on at least two ' 2273 'observations.') 2274 n = Z.shape[0] 2275 if n > 1: 2276 if ((Z[:, 0] < 0).any() or (Z[:, 1] < 0).any()): 2277 raise ValueError('Linkage %scontains negative indices.' % 2278 name_str) 2279 if (Z[:, 2] < 0).any(): 2280 raise ValueError('Linkage %scontains negative distances.' % 2281 name_str) 2282 if (Z[:, 3] < 0).any(): 2283 raise ValueError('Linkage %scontains negative counts.' % 2284 name_str) 2285 if _check_hierarchy_uses_cluster_before_formed(Z): 2286 raise ValueError('Linkage %suses non-singleton cluster before ' 2287 'it is formed.' % name_str) 2288 if _check_hierarchy_uses_cluster_more_than_once(Z): 2289 raise ValueError('Linkage %suses the same cluster more than once.' 2290 % name_str) 2291 except Exception as e: 2292 if throw: 2293 raise 2294 if warning: 2295 _warning(str(e)) 2296 valid = False 2297 2298 return valid 2299 2300 2301def _check_hierarchy_uses_cluster_before_formed(Z): 2302 n = Z.shape[0] + 1 2303 for i in range(0, n - 1): 2304 if Z[i, 0] >= n + i or Z[i, 1] >= n + i: 2305 return True 2306 return False 2307 2308 2309def _check_hierarchy_uses_cluster_more_than_once(Z): 2310 n = Z.shape[0] + 1 2311 chosen = set([]) 2312 for i in range(0, n - 1): 2313 if (Z[i, 0] in chosen) or (Z[i, 1] in chosen) or Z[i, 0] == Z[i, 1]: 2314 return True 2315 chosen.add(Z[i, 0]) 2316 chosen.add(Z[i, 1]) 2317 return False 2318 2319 2320def _check_hierarchy_not_all_clusters_used(Z): 2321 n = Z.shape[0] + 1 2322 chosen = set([]) 2323 for i in range(0, n - 1): 2324 chosen.add(int(Z[i, 0])) 2325 chosen.add(int(Z[i, 1])) 2326 must_chosen = set(range(0, 2 * n - 2)) 2327 return len(must_chosen.difference(chosen)) > 0 2328 2329 2330def num_obs_linkage(Z): 2331 """ 2332 Return the number of original observations of the linkage matrix passed. 2333 2334 Parameters 2335 ---------- 2336 Z : ndarray 2337 The linkage matrix on which to perform the operation. 2338 2339 Returns 2340 ------- 2341 n : int 2342 The number of original observations in the linkage. 2343 2344 Examples 2345 -------- 2346 >>> from scipy.cluster.hierarchy import ward, num_obs_linkage 2347 >>> from scipy.spatial.distance import pdist 2348 2349 >>> X = [[0, 0], [0, 1], [1, 0], 2350 ... [0, 4], [0, 3], [1, 4], 2351 ... [4, 0], [3, 0], [4, 1], 2352 ... [4, 4], [3, 4], [4, 3]] 2353 2354 >>> Z = ward(pdist(X)) 2355 2356 ``Z`` is a linkage matrix obtained after using the Ward clustering method 2357 with ``X``, a dataset with 12 data points. 2358 2359 >>> num_obs_linkage(Z) 2360 12 2361 2362 """ 2363 Z = np.asarray(Z, order='c') 2364 is_valid_linkage(Z, throw=True, name='Z') 2365 return (Z.shape[0] + 1) 2366 2367 2368def correspond(Z, Y): 2369 """ 2370 Check for correspondence between linkage and condensed distance matrices. 2371 2372 They must have the same number of original observations for 2373 the check to succeed. 2374 2375 This function is useful as a sanity check in algorithms that make 2376 extensive use of linkage and distance matrices that must 2377 correspond to the same set of original observations. 2378 2379 Parameters 2380 ---------- 2381 Z : array_like 2382 The linkage matrix to check for correspondence. 2383 Y : array_like 2384 The condensed distance matrix to check for correspondence. 2385 2386 Returns 2387 ------- 2388 b : bool 2389 A boolean indicating whether the linkage matrix and distance 2390 matrix could possibly correspond to one another. 2391 2392 See Also 2393 -------- 2394 linkage : for a description of what a linkage matrix is. 2395 2396 Examples 2397 -------- 2398 >>> from scipy.cluster.hierarchy import ward, correspond 2399 >>> from scipy.spatial.distance import pdist 2400 2401 This method can be used to check if a given linkage matrix ``Z`` has been 2402 obtained from the application of a cluster method over a dataset ``X``: 2403 2404 >>> X = [[0, 0], [0, 1], [1, 0], 2405 ... [0, 4], [0, 3], [1, 4], 2406 ... [4, 0], [3, 0], [4, 1], 2407 ... [4, 4], [3, 4], [4, 3]] 2408 >>> X_condensed = pdist(X) 2409 >>> Z = ward(X_condensed) 2410 2411 Here, we can compare ``Z`` and ``X`` (in condensed form): 2412 2413 >>> correspond(Z, X_condensed) 2414 True 2415 2416 """ 2417 is_valid_linkage(Z, throw=True) 2418 distance.is_valid_y(Y, throw=True) 2419 Z = np.asarray(Z, order='c') 2420 Y = np.asarray(Y, order='c') 2421 return distance.num_obs_y(Y) == num_obs_linkage(Z) 2422 2423 2424def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None): 2425 """ 2426 Form flat clusters from the hierarchical clustering defined by 2427 the given linkage matrix. 2428 2429 Parameters 2430 ---------- 2431 Z : ndarray 2432 The hierarchical clustering encoded with the matrix returned 2433 by the `linkage` function. 2434 t : scalar 2435 For criteria 'inconsistent', 'distance' or 'monocrit', 2436 this is the threshold to apply when forming flat clusters. 2437 For 'maxclust' or 'maxclust_monocrit' criteria, 2438 this would be max number of clusters requested. 2439 criterion : str, optional 2440 The criterion to use in forming flat clusters. This can 2441 be any of the following values: 2442 2443 ``inconsistent`` : 2444 If a cluster node and all its 2445 descendants have an inconsistent value less than or equal 2446 to `t`, then all its leaf descendants belong to the 2447 same flat cluster. When no non-singleton cluster meets 2448 this criterion, every node is assigned to its own 2449 cluster. (Default) 2450 2451 ``distance`` : 2452 Forms flat clusters so that the original 2453 observations in each flat cluster have no greater a 2454 cophenetic distance than `t`. 2455 2456 ``maxclust`` : 2457 Finds a minimum threshold ``r`` so that 2458 the cophenetic distance between any two original 2459 observations in the same flat cluster is no more than 2460 ``r`` and no more than `t` flat clusters are formed. 2461 2462 ``monocrit`` : 2463 Forms a flat cluster from a cluster node c 2464 with index i when ``monocrit[j] <= t``. 2465 2466 For example, to threshold on the maximum mean distance 2467 as computed in the inconsistency matrix R with a 2468 threshold of 0.8 do:: 2469 2470 MR = maxRstat(Z, R, 3) 2471 fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR) 2472 2473 ``maxclust_monocrit`` : 2474 Forms a flat cluster from a 2475 non-singleton cluster node ``c`` when ``monocrit[i] <= 2476 r`` for all cluster indices ``i`` below and including 2477 ``c``. ``r`` is minimized such that no more than ``t`` 2478 flat clusters are formed. monocrit must be 2479 monotonic. For example, to minimize the threshold t on 2480 maximum inconsistency values so that no more than 3 flat 2481 clusters are formed, do:: 2482 2483 MI = maxinconsts(Z, R) 2484 fcluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI) 2485 depth : int, optional 2486 The maximum depth to perform the inconsistency calculation. 2487 It has no meaning for the other criteria. Default is 2. 2488 R : ndarray, optional 2489 The inconsistency matrix to use for the 'inconsistent' 2490 criterion. This matrix is computed if not provided. 2491 monocrit : ndarray, optional 2492 An array of length n-1. `monocrit[i]` is the 2493 statistics upon which non-singleton i is thresholded. The 2494 monocrit vector must be monotonic, i.e., given a node c with 2495 index i, for all node indices j corresponding to nodes 2496 below c, ``monocrit[i] >= monocrit[j]``. 2497 2498 Returns 2499 ------- 2500 fcluster : ndarray 2501 An array of length ``n``. ``T[i]`` is the flat cluster number to 2502 which original observation ``i`` belongs. 2503 2504 See Also 2505 -------- 2506 linkage : for information about hierarchical clustering methods work. 2507 2508 Examples 2509 -------- 2510 >>> from scipy.cluster.hierarchy import ward, fcluster 2511 >>> from scipy.spatial.distance import pdist 2512 2513 All cluster linkage methods - e.g., `scipy.cluster.hierarchy.ward` 2514 generate a linkage matrix ``Z`` as their output: 2515 2516 >>> X = [[0, 0], [0, 1], [1, 0], 2517 ... [0, 4], [0, 3], [1, 4], 2518 ... [4, 0], [3, 0], [4, 1], 2519 ... [4, 4], [3, 4], [4, 3]] 2520 2521 >>> Z = ward(pdist(X)) 2522 2523 >>> Z 2524 array([[ 0. , 1. , 1. , 2. ], 2525 [ 3. , 4. , 1. , 2. ], 2526 [ 6. , 7. , 1. , 2. ], 2527 [ 9. , 10. , 1. , 2. ], 2528 [ 2. , 12. , 1.29099445, 3. ], 2529 [ 5. , 13. , 1.29099445, 3. ], 2530 [ 8. , 14. , 1.29099445, 3. ], 2531 [11. , 15. , 1.29099445, 3. ], 2532 [16. , 17. , 5.77350269, 6. ], 2533 [18. , 19. , 5.77350269, 6. ], 2534 [20. , 21. , 8.16496581, 12. ]]) 2535 2536 This matrix represents a dendrogram, where the first and second elements 2537 are the two clusters merged at each step, the third element is the 2538 distance between these clusters, and the fourth element is the size of 2539 the new cluster - the number of original data points included. 2540 2541 `scipy.cluster.hierarchy.fcluster` can be used to flatten the 2542 dendrogram, obtaining as a result an assignation of the original data 2543 points to single clusters. 2544 2545 This assignation mostly depends on a distance threshold ``t`` - the maximum 2546 inter-cluster distance allowed: 2547 2548 >>> fcluster(Z, t=0.9, criterion='distance') 2549 array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int32) 2550 2551 >>> fcluster(Z, t=1.1, criterion='distance') 2552 array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32) 2553 2554 >>> fcluster(Z, t=3, criterion='distance') 2555 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 2556 2557 >>> fcluster(Z, t=9, criterion='distance') 2558 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32) 2559 2560 In the first case, the threshold ``t`` is too small to allow any two 2561 samples in the data to form a cluster, so 12 different clusters are 2562 returned. 2563 2564 In the second case, the threshold is large enough to allow the first 2565 4 points to be merged with their nearest neighbors. So, here, only 8 2566 clusters are returned. 2567 2568 The third case, with a much higher threshold, allows for up to 8 data 2569 points to be connected - so 4 clusters are returned here. 2570 2571 Lastly, the threshold of the fourth case is large enough to allow for 2572 all data points to be merged together - so a single cluster is returned. 2573 2574 """ 2575 Z = np.asarray(Z, order='c') 2576 is_valid_linkage(Z, throw=True, name='Z') 2577 2578 n = Z.shape[0] + 1 2579 T = np.zeros((n,), dtype='i') 2580 2581 # Since the C code does not support striding using strides. 2582 # The dimensions are used instead. 2583 [Z] = _copy_arrays_if_base_present([Z]) 2584 2585 if criterion == 'inconsistent': 2586 if R is None: 2587 R = inconsistent(Z, depth) 2588 else: 2589 R = np.asarray(R, order='c') 2590 is_valid_im(R, throw=True, name='R') 2591 # Since the C code does not support striding using strides. 2592 # The dimensions are used instead. 2593 [R] = _copy_arrays_if_base_present([R]) 2594 _hierarchy.cluster_in(Z, R, T, float(t), int(n)) 2595 elif criterion == 'distance': 2596 _hierarchy.cluster_dist(Z, T, float(t), int(n)) 2597 elif criterion == 'maxclust': 2598 _hierarchy.cluster_maxclust_dist(Z, T, int(n), int(t)) 2599 elif criterion == 'monocrit': 2600 [monocrit] = _copy_arrays_if_base_present([monocrit]) 2601 _hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n)) 2602 elif criterion == 'maxclust_monocrit': 2603 [monocrit] = _copy_arrays_if_base_present([monocrit]) 2604 _hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t)) 2605 else: 2606 raise ValueError('Invalid cluster formation criterion: %s' 2607 % str(criterion)) 2608 return T 2609 2610 2611def fclusterdata(X, t, criterion='inconsistent', 2612 metric='euclidean', depth=2, method='single', R=None): 2613 """ 2614 Cluster observation data using a given metric. 2615 2616 Clusters the original observations in the n-by-m data 2617 matrix X (n observations in m dimensions), using the euclidean 2618 distance metric to calculate distances between original observations, 2619 performs hierarchical clustering using the single linkage algorithm, 2620 and forms flat clusters using the inconsistency method with `t` as the 2621 cut-off threshold. 2622 2623 A 1-D array ``T`` of length ``n`` is returned. ``T[i]`` is 2624 the index of the flat cluster to which the original observation ``i`` 2625 belongs. 2626 2627 Parameters 2628 ---------- 2629 X : (N, M) ndarray 2630 N by M data matrix with N observations in M dimensions. 2631 t : scalar 2632 For criteria 'inconsistent', 'distance' or 'monocrit', 2633 this is the threshold to apply when forming flat clusters. 2634 For 'maxclust' or 'maxclust_monocrit' criteria, 2635 this would be max number of clusters requested. 2636 criterion : str, optional 2637 Specifies the criterion for forming flat clusters. Valid 2638 values are 'inconsistent' (default), 'distance', or 'maxclust' 2639 cluster formation algorithms. See `fcluster` for descriptions. 2640 metric : str or function, optional 2641 The distance metric for calculating pairwise distances. See 2642 ``distance.pdist`` for descriptions and linkage to verify 2643 compatibility with the linkage method. 2644 depth : int, optional 2645 The maximum depth for the inconsistency calculation. See 2646 `inconsistent` for more information. 2647 method : str, optional 2648 The linkage method to use (single, complete, average, 2649 weighted, median centroid, ward). See `linkage` for more 2650 information. Default is "single". 2651 R : ndarray, optional 2652 The inconsistency matrix. It will be computed if necessary 2653 if it is not passed. 2654 2655 Returns 2656 ------- 2657 fclusterdata : ndarray 2658 A vector of length n. T[i] is the flat cluster number to 2659 which original observation i belongs. 2660 2661 See Also 2662 -------- 2663 scipy.spatial.distance.pdist : pairwise distance metrics 2664 2665 Notes 2666 ----- 2667 This function is similar to the MATLAB function ``clusterdata``. 2668 2669 Examples 2670 -------- 2671 >>> from scipy.cluster.hierarchy import fclusterdata 2672 2673 This is a convenience method that abstracts all the steps to perform in a 2674 typical SciPy's hierarchical clustering workflow. 2675 2676 * Transform the input data into a condensed matrix with `scipy.spatial.distance.pdist`. 2677 2678 * Apply a clustering method. 2679 2680 * Obtain flat clusters at a user defined distance threshold ``t`` using `scipy.cluster.hierarchy.fcluster`. 2681 2682 >>> X = [[0, 0], [0, 1], [1, 0], 2683 ... [0, 4], [0, 3], [1, 4], 2684 ... [4, 0], [3, 0], [4, 1], 2685 ... [4, 4], [3, 4], [4, 3]] 2686 2687 >>> fclusterdata(X, t=1) 2688 array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) 2689 2690 The output here (for the dataset ``X``, distance threshold ``t``, and the 2691 default settings) is four clusters with three data points each. 2692 2693 """ 2694 X = np.asarray(X, order='c', dtype=np.double) 2695 2696 if type(X) != np.ndarray or len(X.shape) != 2: 2697 raise TypeError('The observation matrix X must be an n by m numpy ' 2698 'array.') 2699 2700 Y = distance.pdist(X, metric=metric) 2701 Z = linkage(Y, method=method) 2702 if R is None: 2703 R = inconsistent(Z, d=depth) 2704 else: 2705 R = np.asarray(R, order='c') 2706 T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t) 2707 return T 2708 2709 2710def leaves_list(Z): 2711 """ 2712 Return a list of leaf node ids. 2713 2714 The return corresponds to the observation vector index as it appears 2715 in the tree from left to right. Z is a linkage matrix. 2716 2717 Parameters 2718 ---------- 2719 Z : ndarray 2720 The hierarchical clustering encoded as a matrix. `Z` is 2721 a linkage matrix. See `linkage` for more information. 2722 2723 Returns 2724 ------- 2725 leaves_list : ndarray 2726 The list of leaf node ids. 2727 2728 See Also 2729 -------- 2730 dendrogram : for information about dendrogram structure. 2731 2732 Examples 2733 -------- 2734 >>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list 2735 >>> from scipy.spatial.distance import pdist 2736 >>> from matplotlib import pyplot as plt 2737 2738 >>> X = [[0, 0], [0, 1], [1, 0], 2739 ... [0, 4], [0, 3], [1, 4], 2740 ... [4, 0], [3, 0], [4, 1], 2741 ... [4, 4], [3, 4], [4, 3]] 2742 2743 >>> Z = ward(pdist(X)) 2744 2745 The linkage matrix ``Z`` represents a dendrogram, that is, a tree that 2746 encodes the structure of the clustering performed. 2747 `scipy.cluster.hierarchy.leaves_list` shows the mapping between 2748 indices in the ``X`` dataset and leaves in the dendrogram: 2749 2750 >>> leaves_list(Z) 2751 array([ 2, 0, 1, 5, 3, 4, 8, 6, 7, 11, 9, 10], dtype=int32) 2752 2753 >>> fig = plt.figure(figsize=(25, 10)) 2754 >>> dn = dendrogram(Z) 2755 >>> plt.show() 2756 2757 """ 2758 Z = np.asarray(Z, order='c') 2759 is_valid_linkage(Z, throw=True, name='Z') 2760 n = Z.shape[0] + 1 2761 ML = np.zeros((n,), dtype='i') 2762 [Z] = _copy_arrays_if_base_present([Z]) 2763 _hierarchy.prelist(Z, ML, int(n)) 2764 return ML 2765 2766 2767# Maps number of leaves to text size. 2768# 2769# p <= 20, size="12" 2770# 20 < p <= 30, size="10" 2771# 30 < p <= 50, size="8" 2772# 50 < p <= np.inf, size="6" 2773 2774_dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5} 2775_drotation = {20: 0, 40: 45, np.inf: 90} 2776_dtextsortedkeys = list(_dtextsizes.keys()) 2777_dtextsortedkeys.sort() 2778_drotationsortedkeys = list(_drotation.keys()) 2779_drotationsortedkeys.sort() 2780 2781 2782def _remove_dups(L): 2783 """ 2784 Remove duplicates AND preserve the original order of the elements. 2785 2786 The set class is not guaranteed to do this. 2787 """ 2788 seen_before = set([]) 2789 L2 = [] 2790 for i in L: 2791 if i not in seen_before: 2792 seen_before.add(i) 2793 L2.append(i) 2794 return L2 2795 2796 2797def _get_tick_text_size(p): 2798 for k in _dtextsortedkeys: 2799 if p <= k: 2800 return _dtextsizes[k] 2801 2802 2803def _get_tick_rotation(p): 2804 for k in _drotationsortedkeys: 2805 if p <= k: 2806 return _drotation[k] 2807 2808 2809def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation, 2810 no_labels, color_list, leaf_font_size=None, 2811 leaf_rotation=None, contraction_marks=None, 2812 ax=None, above_threshold_color='C0'): 2813 # Import matplotlib here so that it's not imported unless dendrograms 2814 # are plotted. Raise an informative error if importing fails. 2815 try: 2816 # if an axis is provided, don't use pylab at all 2817 if ax is None: 2818 import matplotlib.pylab 2819 import matplotlib.patches 2820 import matplotlib.collections 2821 except ImportError as e: 2822 raise ImportError("You must install the matplotlib library to plot " 2823 "the dendrogram. Use no_plot=True to calculate the " 2824 "dendrogram without plotting.") from e 2825 2826 if ax is None: 2827 ax = matplotlib.pylab.gca() 2828 # if we're using pylab, we want to trigger a draw at the end 2829 trigger_redraw = True 2830 else: 2831 trigger_redraw = False 2832 2833 # Independent variable plot width 2834 ivw = len(ivl) * 10 2835 # Dependent variable plot height 2836 dvw = mh + mh * 0.05 2837 2838 iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10) 2839 if orientation in ('top', 'bottom'): 2840 if orientation == 'top': 2841 ax.set_ylim([0, dvw]) 2842 ax.set_xlim([0, ivw]) 2843 else: 2844 ax.set_ylim([dvw, 0]) 2845 ax.set_xlim([0, ivw]) 2846 2847 xlines = icoords 2848 ylines = dcoords 2849 if no_labels: 2850 ax.set_xticks([]) 2851 ax.set_xticklabels([]) 2852 else: 2853 ax.set_xticks(iv_ticks) 2854 2855 if orientation == 'top': 2856 ax.xaxis.set_ticks_position('bottom') 2857 else: 2858 ax.xaxis.set_ticks_position('top') 2859 2860 # Make the tick marks invisible because they cover up the links 2861 for line in ax.get_xticklines(): 2862 line.set_visible(False) 2863 2864 leaf_rot = (float(_get_tick_rotation(len(ivl))) 2865 if (leaf_rotation is None) else leaf_rotation) 2866 leaf_font = (float(_get_tick_text_size(len(ivl))) 2867 if (leaf_font_size is None) else leaf_font_size) 2868 ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font) 2869 2870 elif orientation in ('left', 'right'): 2871 if orientation == 'left': 2872 ax.set_xlim([dvw, 0]) 2873 ax.set_ylim([0, ivw]) 2874 else: 2875 ax.set_xlim([0, dvw]) 2876 ax.set_ylim([0, ivw]) 2877 2878 xlines = dcoords 2879 ylines = icoords 2880 if no_labels: 2881 ax.set_yticks([]) 2882 ax.set_yticklabels([]) 2883 else: 2884 ax.set_yticks(iv_ticks) 2885 2886 if orientation == 'left': 2887 ax.yaxis.set_ticks_position('right') 2888 else: 2889 ax.yaxis.set_ticks_position('left') 2890 2891 # Make the tick marks invisible because they cover up the links 2892 for line in ax.get_yticklines(): 2893 line.set_visible(False) 2894 2895 leaf_font = (float(_get_tick_text_size(len(ivl))) 2896 if (leaf_font_size is None) else leaf_font_size) 2897 2898 if leaf_rotation is not None: 2899 ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font) 2900 else: 2901 ax.set_yticklabels(ivl, size=leaf_font) 2902 2903 # Let's use collections instead. This way there is a separate legend item 2904 # for each tree grouping, rather than stupidly one for each line segment. 2905 colors_used = _remove_dups(color_list) 2906 color_to_lines = {} 2907 for color in colors_used: 2908 color_to_lines[color] = [] 2909 for (xline, yline, color) in zip(xlines, ylines, color_list): 2910 color_to_lines[color].append(list(zip(xline, yline))) 2911 2912 colors_to_collections = {} 2913 # Construct the collections. 2914 for color in colors_used: 2915 coll = matplotlib.collections.LineCollection(color_to_lines[color], 2916 colors=(color,)) 2917 colors_to_collections[color] = coll 2918 2919 # Add all the groupings below the color threshold. 2920 for color in colors_used: 2921 if color != above_threshold_color: 2922 ax.add_collection(colors_to_collections[color]) 2923 # If there's a grouping of links above the color threshold, it goes last. 2924 if above_threshold_color in colors_to_collections: 2925 ax.add_collection(colors_to_collections[above_threshold_color]) 2926 2927 if contraction_marks is not None: 2928 Ellipse = matplotlib.patches.Ellipse 2929 for (x, y) in contraction_marks: 2930 if orientation in ('left', 'right'): 2931 e = Ellipse((y, x), width=dvw / 100, height=1.0) 2932 else: 2933 e = Ellipse((x, y), width=1.0, height=dvw / 100) 2934 ax.add_artist(e) 2935 e.set_clip_box(ax.bbox) 2936 e.set_alpha(0.5) 2937 e.set_facecolor('k') 2938 2939 if trigger_redraw: 2940 matplotlib.pylab.draw_if_interactive() 2941 2942 2943# C0 is used for above threshhold color 2944_link_line_colors_default = ('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9') 2945_link_line_colors = list(_link_line_colors_default) 2946 2947 2948def set_link_color_palette(palette): 2949 """ 2950 Set list of matplotlib color codes for use by dendrogram. 2951 2952 Note that this palette is global (i.e., setting it once changes the colors 2953 for all subsequent calls to `dendrogram`) and that it affects only the 2954 the colors below ``color_threshold``. 2955 2956 Note that `dendrogram` also accepts a custom coloring function through its 2957 ``link_color_func`` keyword, which is more flexible and non-global. 2958 2959 Parameters 2960 ---------- 2961 palette : list of str or None 2962 A list of matplotlib color codes. The order of the color codes is the 2963 order in which the colors are cycled through when color thresholding in 2964 the dendrogram. 2965 2966 If ``None``, resets the palette to its default (which are matplotlib 2967 default colors C1 to C9). 2968 2969 Returns 2970 ------- 2971 None 2972 2973 See Also 2974 -------- 2975 dendrogram 2976 2977 Notes 2978 ----- 2979 Ability to reset the palette with ``None`` added in SciPy 0.17.0. 2980 2981 Examples 2982 -------- 2983 >>> from scipy.cluster import hierarchy 2984 >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268., 2985 ... 400., 754., 564., 138., 219., 869., 669.]) 2986 >>> Z = hierarchy.linkage(ytdist, 'single') 2987 >>> dn = hierarchy.dendrogram(Z, no_plot=True) 2988 >>> dn['color_list'] 2989 ['C1', 'C0', 'C0', 'C0', 'C0'] 2990 >>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k']) 2991 >>> dn = hierarchy.dendrogram(Z, no_plot=True, above_threshold_color='b') 2992 >>> dn['color_list'] 2993 ['c', 'b', 'b', 'b', 'b'] 2994 >>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267, 2995 ... above_threshold_color='k') 2996 >>> dn['color_list'] 2997 ['c', 'm', 'm', 'k', 'k'] 2998 2999 Now, reset the color palette to its default: 3000 3001 >>> hierarchy.set_link_color_palette(None) 3002 3003 """ 3004 if palette is None: 3005 # reset to its default 3006 palette = _link_line_colors_default 3007 elif type(palette) not in (list, tuple): 3008 raise TypeError("palette must be a list or tuple") 3009 _ptypes = [isinstance(p, str) for p in palette] 3010 3011 if False in _ptypes: 3012 raise TypeError("all palette list elements must be color strings") 3013 3014 global _link_line_colors 3015 _link_line_colors = palette 3016 3017 3018def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None, 3019 get_leaves=True, orientation='top', labels=None, 3020 count_sort=False, distance_sort=False, show_leaf_counts=True, 3021 no_plot=False, no_labels=False, leaf_font_size=None, 3022 leaf_rotation=None, leaf_label_func=None, 3023 show_contracted=False, link_color_func=None, ax=None, 3024 above_threshold_color='C0'): 3025 """ 3026 Plot the hierarchical clustering as a dendrogram. 3027 3028 The dendrogram illustrates how each cluster is 3029 composed by drawing a U-shaped link between a non-singleton 3030 cluster and its children. The top of the U-link indicates a 3031 cluster merge. The two legs of the U-link indicate which clusters 3032 were merged. The length of the two legs of the U-link represents 3033 the distance between the child clusters. It is also the 3034 cophenetic distance between original observations in the two 3035 children clusters. 3036 3037 Parameters 3038 ---------- 3039 Z : ndarray 3040 The linkage matrix encoding the hierarchical clustering to 3041 render as a dendrogram. See the ``linkage`` function for more 3042 information on the format of ``Z``. 3043 p : int, optional 3044 The ``p`` parameter for ``truncate_mode``. 3045 truncate_mode : str, optional 3046 The dendrogram can be hard to read when the original 3047 observation matrix from which the linkage is derived is 3048 large. Truncation is used to condense the dendrogram. There 3049 are several modes: 3050 3051 ``None`` 3052 No truncation is performed (default). 3053 Note: ``'none'`` is an alias for ``None`` that's kept for 3054 backward compatibility. 3055 3056 ``'lastp'`` 3057 The last ``p`` non-singleton clusters formed in the linkage are the 3058 only non-leaf nodes in the linkage; they correspond to rows 3059 ``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are 3060 contracted into leaf nodes. 3061 3062 ``'level'`` 3063 No more than ``p`` levels of the dendrogram tree are displayed. 3064 A "level" includes all nodes with ``p`` merges from the final merge. 3065 3066 Note: ``'mtica'`` is an alias for ``'level'`` that's kept for 3067 backward compatibility. 3068 3069 color_threshold : double, optional 3070 For brevity, let :math:`t` be the ``color_threshold``. 3071 Colors all the descendent links below a cluster node 3072 :math:`k` the same color if :math:`k` is the first node below 3073 the cut threshold :math:`t`. All links connecting nodes with 3074 distances greater than or equal to the threshold are colored 3075 with de default matplotlib color ``'C0'``. If :math:`t` is less 3076 than or equal to zero, all nodes are colored ``'C0'``. 3077 If ``color_threshold`` is None or 'default', 3078 corresponding with MATLAB(TM) behavior, the threshold is set to 3079 ``0.7*max(Z[:,2])``. 3080 3081 get_leaves : bool, optional 3082 Includes a list ``R['leaves']=H`` in the result 3083 dictionary. For each :math:`i`, ``H[i] == j``, cluster node 3084 ``j`` appears in position ``i`` in the left-to-right traversal 3085 of the leaves, where :math:`j < 2n-1` and :math:`i < n`. 3086 orientation : str, optional 3087 The direction to plot the dendrogram, which can be any 3088 of the following strings: 3089 3090 ``'top'`` 3091 Plots the root at the top, and plot descendent links going downwards. 3092 (default). 3093 3094 ``'bottom'`` 3095 Plots the root at the bottom, and plot descendent links going 3096 upwards. 3097 3098 ``'left'`` 3099 Plots the root at the left, and plot descendent links going right. 3100 3101 ``'right'`` 3102 Plots the root at the right, and plot descendent links going left. 3103 3104 labels : ndarray, optional 3105 By default, ``labels`` is None so the index of the original observation 3106 is used to label the leaf nodes. Otherwise, this is an :math:`n`-sized 3107 sequence, with ``n == Z.shape[0] + 1``. The ``labels[i]`` value is the 3108 text to put under the :math:`i` th leaf node only if it corresponds to 3109 an original observation and not a non-singleton cluster. 3110 count_sort : str or bool, optional 3111 For each node n, the order (visually, from left-to-right) n's 3112 two descendent links are plotted is determined by this 3113 parameter, which can be any of the following values: 3114 3115 ``False`` 3116 Nothing is done. 3117 3118 ``'ascending'`` or ``True`` 3119 The child with the minimum number of original objects in its cluster 3120 is plotted first. 3121 3122 ``'descending'`` 3123 The child with the maximum number of original objects in its cluster 3124 is plotted first. 3125 3126 Note, ``distance_sort`` and ``count_sort`` cannot both be True. 3127 distance_sort : str or bool, optional 3128 For each node n, the order (visually, from left-to-right) n's 3129 two descendent links are plotted is determined by this 3130 parameter, which can be any of the following values: 3131 3132 ``False`` 3133 Nothing is done. 3134 3135 ``'ascending'`` or ``True`` 3136 The child with the minimum distance between its direct descendents is 3137 plotted first. 3138 3139 ``'descending'`` 3140 The child with the maximum distance between its direct descendents is 3141 plotted first. 3142 3143 Note ``distance_sort`` and ``count_sort`` cannot both be True. 3144 show_leaf_counts : bool, optional 3145 When True, leaf nodes representing :math:`k>1` original 3146 observation are labeled with the number of observations they 3147 contain in parentheses. 3148 no_plot : bool, optional 3149 When True, the final rendering is not performed. This is 3150 useful if only the data structures computed for the rendering 3151 are needed or if matplotlib is not available. 3152 no_labels : bool, optional 3153 When True, no labels appear next to the leaf nodes in the 3154 rendering of the dendrogram. 3155 leaf_rotation : double, optional 3156 Specifies the angle (in degrees) to rotate the leaf 3157 labels. When unspecified, the rotation is based on the number of 3158 nodes in the dendrogram (default is 0). 3159 leaf_font_size : int, optional 3160 Specifies the font size (in points) of the leaf labels. When 3161 unspecified, the size based on the number of nodes in the 3162 dendrogram. 3163 leaf_label_func : lambda or function, optional 3164 When ``leaf_label_func`` is a callable function, for each 3165 leaf with cluster index :math:`k < 2n-1`. The function 3166 is expected to return a string with the label for the 3167 leaf. 3168 3169 Indices :math:`k < n` correspond to original observations 3170 while indices :math:`k \\geq n` correspond to non-singleton 3171 clusters. 3172 3173 For example, to label singletons with their node id and 3174 non-singletons with their id, count, and inconsistency 3175 coefficient, simply do:: 3176 3177 # First define the leaf label function. 3178 def llf(id): 3179 if id < n: 3180 return str(id) 3181 else: 3182 return '[%d %d %1.2f]' % (id, count, R[n-id,3]) 3183 3184 # The text for the leaf nodes is going to be big so force 3185 # a rotation of 90 degrees. 3186 dendrogram(Z, leaf_label_func=llf, leaf_rotation=90) 3187 3188 # leaf_label_func can also be used together with ``truncate_mode`` parameter, 3189 # in which case you will get your leaves labeled after truncation: 3190 dendrogram(Z, leaf_label_func=llf, leaf_rotation=90, 3191 truncate_mode='level', p=2) 3192 3193 show_contracted : bool, optional 3194 When True the heights of non-singleton nodes contracted 3195 into a leaf node are plotted as crosses along the link 3196 connecting that leaf node. This really is only useful when 3197 truncation is used (see ``truncate_mode`` parameter). 3198 link_color_func : callable, optional 3199 If given, `link_color_function` is called with each non-singleton id 3200 corresponding to each U-shaped link it will paint. The function is 3201 expected to return the color to paint the link, encoded as a matplotlib 3202 color string code. For example:: 3203 3204 dendrogram(Z, link_color_func=lambda k: colors[k]) 3205 3206 colors the direct links below each untruncated non-singleton node 3207 ``k`` using ``colors[k]``. 3208 ax : matplotlib Axes instance, optional 3209 If None and `no_plot` is not True, the dendrogram will be plotted 3210 on the current axes. Otherwise if `no_plot` is not True the 3211 dendrogram will be plotted on the given ``Axes`` instance. This can be 3212 useful if the dendrogram is part of a more complex figure. 3213 above_threshold_color : str, optional 3214 This matplotlib color string sets the color of the links above the 3215 color_threshold. The default is ``'C0'``. 3216 3217 Returns 3218 ------- 3219 R : dict 3220 A dictionary of data structures computed to render the 3221 dendrogram. Its has the following keys: 3222 3223 ``'color_list'`` 3224 A list of color names. The k'th element represents the color of the 3225 k'th link. 3226 3227 ``'icoord'`` and ``'dcoord'`` 3228 Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]`` 3229 where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]`` 3230 where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is 3231 ``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``. 3232 3233 ``'ivl'`` 3234 A list of labels corresponding to the leaf nodes. 3235 3236 ``'leaves'`` 3237 For each i, ``H[i] == j``, cluster node ``j`` appears in position 3238 ``i`` in the left-to-right traversal of the leaves, where 3239 :math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the 3240 ``i``-th leaf node corresponds to an original observation. 3241 Otherwise, it corresponds to a non-singleton cluster. 3242 3243 ``'leaves_color_list'`` 3244 A list of color names. The k'th element represents the color of the 3245 k'th leaf. 3246 3247 See Also 3248 -------- 3249 linkage, set_link_color_palette 3250 3251 Notes 3252 ----- 3253 It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise 3254 crossings appear in the dendrogram. 3255 3256 Examples 3257 -------- 3258 >>> from scipy.cluster import hierarchy 3259 >>> import matplotlib.pyplot as plt 3260 3261 A very basic example: 3262 3263 >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268., 3264 ... 400., 754., 564., 138., 219., 869., 669.]) 3265 >>> Z = hierarchy.linkage(ytdist, 'single') 3266 >>> plt.figure() 3267 >>> dn = hierarchy.dendrogram(Z) 3268 3269 Now, plot in given axes, improve the color scheme and use both vertical and 3270 horizontal orientations: 3271 3272 >>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k']) 3273 >>> fig, axes = plt.subplots(1, 2, figsize=(8, 3)) 3274 >>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y', 3275 ... orientation='top') 3276 >>> dn2 = hierarchy.dendrogram(Z, ax=axes[1], 3277 ... above_threshold_color='#bcbddc', 3278 ... orientation='right') 3279 >>> hierarchy.set_link_color_palette(None) # reset to default after use 3280 >>> plt.show() 3281 3282 """ 3283 # This feature was thought about but never implemented (still useful?): 3284 # 3285 # ... = dendrogram(..., leaves_order=None) 3286 # 3287 # Plots the leaves in the order specified by a vector of 3288 # original observation indices. If the vector contains duplicates 3289 # or results in a crossing, an exception will be thrown. Passing 3290 # None orders leaf nodes based on the order they appear in the 3291 # pre-order traversal. 3292 Z = np.asarray(Z, order='c') 3293 3294 if orientation not in ["top", "left", "bottom", "right"]: 3295 raise ValueError("orientation must be one of 'top', 'left', " 3296 "'bottom', or 'right'") 3297 3298 if labels is not None and Z.shape[0] + 1 != len(labels): 3299 raise ValueError("Dimensions of Z and labels must be consistent.") 3300 3301 is_valid_linkage(Z, throw=True, name='Z') 3302 Zs = Z.shape 3303 n = Zs[0] + 1 3304 if type(p) in (int, float): 3305 p = int(p) 3306 else: 3307 raise TypeError('The second argument must be a number') 3308 3309 if truncate_mode not in ('lastp', 'mlab', 'mtica', 'level', 'none', None): 3310 # 'mlab' and 'mtica' are kept working for backwards compat. 3311 raise ValueError('Invalid truncation mode.') 3312 3313 if truncate_mode == 'lastp' or truncate_mode == 'mlab': 3314 if p > n or p == 0: 3315 p = n 3316 3317 if truncate_mode == 'mtica': 3318 # 'mtica' is an alias 3319 truncate_mode = 'level' 3320 3321 if truncate_mode == 'level': 3322 if p <= 0: 3323 p = np.inf 3324 3325 if get_leaves: 3326 lvs = [] 3327 else: 3328 lvs = None 3329 3330 icoord_list = [] 3331 dcoord_list = [] 3332 color_list = [] 3333 current_color = [0] 3334 currently_below_threshold = [False] 3335 ivl = [] # list of leaves 3336 3337 if color_threshold is None or (isinstance(color_threshold, str) and 3338 color_threshold == 'default'): 3339 color_threshold = max(Z[:, 2]) * 0.7 3340 3341 R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl, 3342 'leaves': lvs, 'color_list': color_list} 3343 3344 # Empty list will be filled in _dendrogram_calculate_info 3345 contraction_marks = [] if show_contracted else None 3346 3347 _dendrogram_calculate_info( 3348 Z=Z, p=p, 3349 truncate_mode=truncate_mode, 3350 color_threshold=color_threshold, 3351 get_leaves=get_leaves, 3352 orientation=orientation, 3353 labels=labels, 3354 count_sort=count_sort, 3355 distance_sort=distance_sort, 3356 show_leaf_counts=show_leaf_counts, 3357 i=2*n - 2, 3358 iv=0.0, 3359 ivl=ivl, 3360 n=n, 3361 icoord_list=icoord_list, 3362 dcoord_list=dcoord_list, 3363 lvs=lvs, 3364 current_color=current_color, 3365 color_list=color_list, 3366 currently_below_threshold=currently_below_threshold, 3367 leaf_label_func=leaf_label_func, 3368 contraction_marks=contraction_marks, 3369 link_color_func=link_color_func, 3370 above_threshold_color=above_threshold_color) 3371 3372 if not no_plot: 3373 mh = max(Z[:, 2]) 3374 _plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation, 3375 no_labels, color_list, 3376 leaf_font_size=leaf_font_size, 3377 leaf_rotation=leaf_rotation, 3378 contraction_marks=contraction_marks, 3379 ax=ax, 3380 above_threshold_color=above_threshold_color) 3381 3382 R["leaves_color_list"] = _get_leaves_color_list(R) 3383 3384 return R 3385 3386 3387def _get_leaves_color_list(R): 3388 leaves_color_list = [None] * len(R['leaves']) 3389 for link_x, link_y, link_color in zip(R['icoord'], 3390 R['dcoord'], 3391 R['color_list']): 3392 for (xi, yi) in zip(link_x, link_y): 3393 if yi == 0.0: # if yi is 0.0, the point is a leaf 3394 # xi of leaves are 5, 15, 25, 35, ... (see `iv_ticks`) 3395 # index of leaves are 0, 1, 2, 3, ... as below 3396 leaf_index = (int(xi) - 5) // 10 3397 # each leaf has a same color of its link. 3398 leaves_color_list[leaf_index] = link_color 3399 return leaves_color_list 3400 3401 3402def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func, 3403 i, labels): 3404 # If the leaf id structure is not None and is a list then the caller 3405 # to dendrogram has indicated that cluster id's corresponding to the 3406 # leaf nodes should be recorded. 3407 3408 if lvs is not None: 3409 lvs.append(int(i)) 3410 3411 # If leaf node labels are to be displayed... 3412 if ivl is not None: 3413 # If a leaf_label_func has been provided, the label comes from the 3414 # string returned from the leaf_label_func, which is a function 3415 # passed to dendrogram. 3416 if leaf_label_func: 3417 ivl.append(leaf_label_func(int(i))) 3418 else: 3419 # Otherwise, if the dendrogram caller has passed a labels list 3420 # for the leaf nodes, use it. 3421 if labels is not None: 3422 ivl.append(labels[int(i - n)]) 3423 else: 3424 # Otherwise, use the id as the label for the leaf.x 3425 ivl.append(str(int(i))) 3426 3427 3428def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func, 3429 i, labels, show_leaf_counts): 3430 # If the leaf id structure is not None and is a list then the caller 3431 # to dendrogram has indicated that cluster id's corresponding to the 3432 # leaf nodes should be recorded. 3433 3434 if lvs is not None: 3435 lvs.append(int(i)) 3436 if ivl is not None: 3437 if leaf_label_func: 3438 ivl.append(leaf_label_func(int(i))) 3439 else: 3440 if show_leaf_counts: 3441 ivl.append("(" + str(int(Z[i - n, 3])) + ")") 3442 else: 3443 ivl.append("") 3444 3445 3446def _append_contraction_marks(Z, iv, i, n, contraction_marks): 3447 _append_contraction_marks_sub(Z, iv, int(Z[i - n, 0]), n, contraction_marks) 3448 _append_contraction_marks_sub(Z, iv, int(Z[i - n, 1]), n, contraction_marks) 3449 3450 3451def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks): 3452 if i >= n: 3453 contraction_marks.append((iv, Z[i - n, 2])) 3454 _append_contraction_marks_sub(Z, iv, int(Z[i - n, 0]), n, contraction_marks) 3455 _append_contraction_marks_sub(Z, iv, int(Z[i - n, 1]), n, contraction_marks) 3456 3457 3458def _dendrogram_calculate_info(Z, p, truncate_mode, 3459 color_threshold=np.inf, get_leaves=True, 3460 orientation='top', labels=None, 3461 count_sort=False, distance_sort=False, 3462 show_leaf_counts=False, i=-1, iv=0.0, 3463 ivl=[], n=0, icoord_list=[], dcoord_list=[], 3464 lvs=None, mhr=False, 3465 current_color=[], color_list=[], 3466 currently_below_threshold=[], 3467 leaf_label_func=None, level=0, 3468 contraction_marks=None, 3469 link_color_func=None, 3470 above_threshold_color='C0'): 3471 """ 3472 Calculate the endpoints of the links as well as the labels for the 3473 the dendrogram rooted at the node with index i. iv is the independent 3474 variable value to plot the left-most leaf node below the root node i 3475 (if orientation='top', this would be the left-most x value where the 3476 plotting of this root node i and its descendents should begin). 3477 3478 ivl is a list to store the labels of the leaf nodes. The leaf_label_func 3479 is called whenever ivl != None, labels == None, and 3480 leaf_label_func != None. When ivl != None and labels != None, the 3481 labels list is used only for labeling the leaf nodes. When 3482 ivl == None, no labels are generated for leaf nodes. 3483 3484 When get_leaves==True, a list of leaves is built as they are visited 3485 in the dendrogram. 3486 3487 Returns a tuple with l being the independent variable coordinate that 3488 corresponds to the midpoint of cluster to the left of cluster i if 3489 i is non-singleton, otherwise the independent coordinate of the leaf 3490 node if i is a leaf node. 3491 3492 Returns 3493 ------- 3494 A tuple (left, w, h, md), where: 3495 * left is the independent variable coordinate of the center of the 3496 the U of the subtree 3497 3498 * w is the amount of space used for the subtree (in independent 3499 variable units) 3500 3501 * h is the height of the subtree in dependent variable units 3502 3503 * md is the ``max(Z[*,2]``) for all nodes ``*`` below and including 3504 the target node. 3505 3506 """ 3507 if n == 0: 3508 raise ValueError("Invalid singleton cluster count n.") 3509 3510 if i == -1: 3511 raise ValueError("Invalid root cluster index i.") 3512 3513 if truncate_mode == 'lastp': 3514 # If the node is a leaf node but corresponds to a non-singleton 3515 # cluster, its label is either the empty string or the number of 3516 # original observations belonging to cluster i. 3517 if 2*n - p > i >= n: 3518 d = Z[i - n, 2] 3519 _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, 3520 leaf_label_func, i, labels, 3521 show_leaf_counts) 3522 if contraction_marks is not None: 3523 _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks) 3524 return (iv + 5.0, 10.0, 0.0, d) 3525 elif i < n: 3526 _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, 3527 leaf_label_func, i, labels) 3528 return (iv + 5.0, 10.0, 0.0, 0.0) 3529 elif truncate_mode == 'level': 3530 if i > n and level > p: 3531 d = Z[i - n, 2] 3532 _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, 3533 leaf_label_func, i, labels, 3534 show_leaf_counts) 3535 if contraction_marks is not None: 3536 _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks) 3537 return (iv + 5.0, 10.0, 0.0, d) 3538 elif i < n: 3539 _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, 3540 leaf_label_func, i, labels) 3541 return (iv + 5.0, 10.0, 0.0, 0.0) 3542 elif truncate_mode in ('mlab',): 3543 msg = "Mode 'mlab' is deprecated in scipy 0.19.0 (it never worked)." 3544 warnings.warn(msg, DeprecationWarning) 3545 3546 # Otherwise, only truncate if we have a leaf node. 3547 # 3548 # Only place leaves if they correspond to original observations. 3549 if i < n: 3550 _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, 3551 leaf_label_func, i, labels) 3552 return (iv + 5.0, 10.0, 0.0, 0.0) 3553 3554 # !!! Otherwise, we don't have a leaf node, so work on plotting a 3555 # non-leaf node. 3556 # Actual indices of a and b 3557 aa = int(Z[i - n, 0]) 3558 ab = int(Z[i - n, 1]) 3559 if aa >= n: 3560 # The number of singletons below cluster a 3561 na = Z[aa - n, 3] 3562 # The distance between a's two direct children. 3563 da = Z[aa - n, 2] 3564 else: 3565 na = 1 3566 da = 0.0 3567 if ab >= n: 3568 nb = Z[ab - n, 3] 3569 db = Z[ab - n, 2] 3570 else: 3571 nb = 1 3572 db = 0.0 3573 3574 if count_sort == 'ascending' or count_sort == True: 3575 # If a has a count greater than b, it and its descendents should 3576 # be drawn to the right. Otherwise, to the left. 3577 if na > nb: 3578 # The cluster index to draw to the left (ua) will be ab 3579 # and the one to draw to the right (ub) will be aa 3580 ua = ab 3581 ub = aa 3582 else: 3583 ua = aa 3584 ub = ab 3585 elif count_sort == 'descending': 3586 # If a has a count less than or equal to b, it and its 3587 # descendents should be drawn to the left. Otherwise, to 3588 # the right. 3589 if na > nb: 3590 ua = aa 3591 ub = ab 3592 else: 3593 ua = ab 3594 ub = aa 3595 elif distance_sort == 'ascending' or distance_sort == True: 3596 # If a has a distance greater than b, it and its descendents should 3597 # be drawn to the right. Otherwise, to the left. 3598 if da > db: 3599 ua = ab 3600 ub = aa 3601 else: 3602 ua = aa 3603 ub = ab 3604 elif distance_sort == 'descending': 3605 # If a has a distance less than or equal to b, it and its 3606 # descendents should be drawn to the left. Otherwise, to 3607 # the right. 3608 if da > db: 3609 ua = aa 3610 ub = ab 3611 else: 3612 ua = ab 3613 ub = aa 3614 else: 3615 ua = aa 3616 ub = ab 3617 3618 # Updated iv variable and the amount of space used. 3619 (uiva, uwa, uah, uamd) = \ 3620 _dendrogram_calculate_info( 3621 Z=Z, p=p, 3622 truncate_mode=truncate_mode, 3623 color_threshold=color_threshold, 3624 get_leaves=get_leaves, 3625 orientation=orientation, 3626 labels=labels, 3627 count_sort=count_sort, 3628 distance_sort=distance_sort, 3629 show_leaf_counts=show_leaf_counts, 3630 i=ua, iv=iv, ivl=ivl, n=n, 3631 icoord_list=icoord_list, 3632 dcoord_list=dcoord_list, lvs=lvs, 3633 current_color=current_color, 3634 color_list=color_list, 3635 currently_below_threshold=currently_below_threshold, 3636 leaf_label_func=leaf_label_func, 3637 level=level + 1, contraction_marks=contraction_marks, 3638 link_color_func=link_color_func, 3639 above_threshold_color=above_threshold_color) 3640 3641 h = Z[i - n, 2] 3642 if h >= color_threshold or color_threshold <= 0: 3643 c = above_threshold_color 3644 3645 if currently_below_threshold[0]: 3646 current_color[0] = (current_color[0] + 1) % len(_link_line_colors) 3647 currently_below_threshold[0] = False 3648 else: 3649 currently_below_threshold[0] = True 3650 c = _link_line_colors[current_color[0]] 3651 3652 (uivb, uwb, ubh, ubmd) = \ 3653 _dendrogram_calculate_info( 3654 Z=Z, p=p, 3655 truncate_mode=truncate_mode, 3656 color_threshold=color_threshold, 3657 get_leaves=get_leaves, 3658 orientation=orientation, 3659 labels=labels, 3660 count_sort=count_sort, 3661 distance_sort=distance_sort, 3662 show_leaf_counts=show_leaf_counts, 3663 i=ub, iv=iv + uwa, ivl=ivl, n=n, 3664 icoord_list=icoord_list, 3665 dcoord_list=dcoord_list, lvs=lvs, 3666 current_color=current_color, 3667 color_list=color_list, 3668 currently_below_threshold=currently_below_threshold, 3669 leaf_label_func=leaf_label_func, 3670 level=level + 1, contraction_marks=contraction_marks, 3671 link_color_func=link_color_func, 3672 above_threshold_color=above_threshold_color) 3673 3674 max_dist = max(uamd, ubmd, h) 3675 3676 icoord_list.append([uiva, uiva, uivb, uivb]) 3677 dcoord_list.append([uah, h, h, ubh]) 3678 if link_color_func is not None: 3679 v = link_color_func(int(i)) 3680 if not isinstance(v, str): 3681 raise TypeError("link_color_func must return a matplotlib " 3682 "color string!") 3683 color_list.append(v) 3684 else: 3685 color_list.append(c) 3686 3687 return (((uiva + uivb) / 2), uwa + uwb, h, max_dist) 3688 3689 3690def is_isomorphic(T1, T2): 3691 """ 3692 Determine if two different cluster assignments are equivalent. 3693 3694 Parameters 3695 ---------- 3696 T1 : array_like 3697 An assignment of singleton cluster ids to flat cluster ids. 3698 T2 : array_like 3699 An assignment of singleton cluster ids to flat cluster ids. 3700 3701 Returns 3702 ------- 3703 b : bool 3704 Whether the flat cluster assignments `T1` and `T2` are 3705 equivalent. 3706 3707 See Also 3708 -------- 3709 linkage : for a description of what a linkage matrix is. 3710 fcluster : for the creation of flat cluster assignments. 3711 3712 Examples 3713 -------- 3714 >>> from scipy.cluster.hierarchy import fcluster, is_isomorphic 3715 >>> from scipy.cluster.hierarchy import single, complete 3716 >>> from scipy.spatial.distance import pdist 3717 3718 Two flat cluster assignments can be isomorphic if they represent the same 3719 cluster assignment, with different labels. 3720 3721 For example, we can use the `scipy.cluster.hierarchy.single`: method 3722 and flatten the output to four clusters: 3723 3724 >>> X = [[0, 0], [0, 1], [1, 0], 3725 ... [0, 4], [0, 3], [1, 4], 3726 ... [4, 0], [3, 0], [4, 1], 3727 ... [4, 4], [3, 4], [4, 3]] 3728 3729 >>> Z = single(pdist(X)) 3730 >>> T = fcluster(Z, 1, criterion='distance') 3731 >>> T 3732 array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32) 3733 3734 We can then do the same using the 3735 `scipy.cluster.hierarchy.complete`: method: 3736 3737 >>> Z = complete(pdist(X)) 3738 >>> T_ = fcluster(Z, 1.5, criterion='distance') 3739 >>> T_ 3740 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 3741 3742 As we can see, in both cases we obtain four clusters and all the data 3743 points are distributed in the same way - the only thing that changes 3744 are the flat cluster labels (3 => 1, 4 =>2, 2 =>3 and 4 =>1), so both 3745 cluster assignments are isomorphic: 3746 3747 >>> is_isomorphic(T, T_) 3748 True 3749 3750 """ 3751 T1 = np.asarray(T1, order='c') 3752 T2 = np.asarray(T2, order='c') 3753 3754 if type(T1) != np.ndarray: 3755 raise TypeError('T1 must be a numpy array.') 3756 if type(T2) != np.ndarray: 3757 raise TypeError('T2 must be a numpy array.') 3758 3759 T1S = T1.shape 3760 T2S = T2.shape 3761 3762 if len(T1S) != 1: 3763 raise ValueError('T1 must be one-dimensional.') 3764 if len(T2S) != 1: 3765 raise ValueError('T2 must be one-dimensional.') 3766 if T1S[0] != T2S[0]: 3767 raise ValueError('T1 and T2 must have the same number of elements.') 3768 n = T1S[0] 3769 d1 = {} 3770 d2 = {} 3771 for i in range(0, n): 3772 if T1[i] in d1: 3773 if not T2[i] in d2: 3774 return False 3775 if d1[T1[i]] != T2[i] or d2[T2[i]] != T1[i]: 3776 return False 3777 elif T2[i] in d2: 3778 return False 3779 else: 3780 d1[T1[i]] = T2[i] 3781 d2[T2[i]] = T1[i] 3782 return True 3783 3784 3785def maxdists(Z): 3786 """ 3787 Return the maximum distance between any non-singleton cluster. 3788 3789 Parameters 3790 ---------- 3791 Z : ndarray 3792 The hierarchical clustering encoded as a matrix. See 3793 ``linkage`` for more information. 3794 3795 Returns 3796 ------- 3797 maxdists : ndarray 3798 A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents 3799 the maximum distance between any cluster (including 3800 singletons) below and including the node with index i. More 3801 specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the 3802 set of all node indices below and including node i. 3803 3804 See Also 3805 -------- 3806 linkage : for a description of what a linkage matrix is. 3807 is_monotonic : for testing for monotonicity of a linkage matrix. 3808 3809 Examples 3810 -------- 3811 >>> from scipy.cluster.hierarchy import median, maxdists 3812 >>> from scipy.spatial.distance import pdist 3813 3814 Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists` 3815 computes for each new cluster generated (i.e., for each row of the linkage 3816 matrix) what is the maximum distance between any two child clusters. 3817 3818 Due to the nature of hierarchical clustering, in many cases this is going 3819 to be just the distance between the two child clusters that were merged 3820 to form the current one - that is, Z[:,2]. 3821 3822 However, for non-monotonic cluster assignments such as 3823 `scipy.cluster.hierarchy.median` clustering this is not always the 3824 case: There may be cluster formations were the distance between the two 3825 clusters merged is smaller than the distance between their children. 3826 3827 We can see this in an example: 3828 3829 >>> X = [[0, 0], [0, 1], [1, 0], 3830 ... [0, 4], [0, 3], [1, 4], 3831 ... [4, 0], [3, 0], [4, 1], 3832 ... [4, 4], [3, 4], [4, 3]] 3833 3834 >>> Z = median(pdist(X)) 3835 >>> Z 3836 array([[ 0. , 1. , 1. , 2. ], 3837 [ 3. , 4. , 1. , 2. ], 3838 [ 9. , 10. , 1. , 2. ], 3839 [ 6. , 7. , 1. , 2. ], 3840 [ 2. , 12. , 1.11803399, 3. ], 3841 [ 5. , 13. , 1.11803399, 3. ], 3842 [ 8. , 15. , 1.11803399, 3. ], 3843 [11. , 14. , 1.11803399, 3. ], 3844 [18. , 19. , 3. , 6. ], 3845 [16. , 17. , 3.5 , 6. ], 3846 [20. , 21. , 3.25 , 12. ]]) 3847 >>> maxdists(Z) 3848 array([1. , 1. , 1. , 1. , 1.11803399, 3849 1.11803399, 1.11803399, 1.11803399, 3. , 3.5 , 3850 3.5 ]) 3851 3852 Note that while the distance between the two clusters merged when creating the 3853 last cluster is 3.25, there are two children (clusters 16 and 17) whose distance 3854 is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in 3855 this case. 3856 3857 """ 3858 Z = np.asarray(Z, order='c', dtype=np.double) 3859 is_valid_linkage(Z, throw=True, name='Z') 3860 3861 n = Z.shape[0] + 1 3862 MD = np.zeros((n - 1,)) 3863 [Z] = _copy_arrays_if_base_present([Z]) 3864 _hierarchy.get_max_dist_for_each_cluster(Z, MD, int(n)) 3865 return MD 3866 3867 3868def maxinconsts(Z, R): 3869 """ 3870 Return the maximum inconsistency coefficient for each 3871 non-singleton cluster and its children. 3872 3873 Parameters 3874 ---------- 3875 Z : ndarray 3876 The hierarchical clustering encoded as a matrix. See 3877 `linkage` for more information. 3878 R : ndarray 3879 The inconsistency matrix. 3880 3881 Returns 3882 ------- 3883 MI : ndarray 3884 A monotonic ``(n-1)``-sized numpy array of doubles. 3885 3886 See Also 3887 -------- 3888 linkage : for a description of what a linkage matrix is. 3889 inconsistent : for the creation of a inconsistency matrix. 3890 3891 Examples 3892 -------- 3893 >>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts 3894 >>> from scipy.spatial.distance import pdist 3895 3896 Given a data set ``X``, we can apply a clustering method to obtain a 3897 linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can 3898 be also used to obtain the inconsistency matrix ``R`` associated to 3899 this clustering process: 3900 3901 >>> X = [[0, 0], [0, 1], [1, 0], 3902 ... [0, 4], [0, 3], [1, 4], 3903 ... [4, 0], [3, 0], [4, 1], 3904 ... [4, 4], [3, 4], [4, 3]] 3905 3906 >>> Z = median(pdist(X)) 3907 >>> R = inconsistent(Z) 3908 >>> Z 3909 array([[ 0. , 1. , 1. , 2. ], 3910 [ 3. , 4. , 1. , 2. ], 3911 [ 9. , 10. , 1. , 2. ], 3912 [ 6. , 7. , 1. , 2. ], 3913 [ 2. , 12. , 1.11803399, 3. ], 3914 [ 5. , 13. , 1.11803399, 3. ], 3915 [ 8. , 15. , 1.11803399, 3. ], 3916 [11. , 14. , 1.11803399, 3. ], 3917 [18. , 19. , 3. , 6. ], 3918 [16. , 17. , 3.5 , 6. ], 3919 [20. , 21. , 3.25 , 12. ]]) 3920 >>> R 3921 array([[1. , 0. , 1. , 0. ], 3922 [1. , 0. , 1. , 0. ], 3923 [1. , 0. , 1. , 0. ], 3924 [1. , 0. , 1. , 0. ], 3925 [1.05901699, 0.08346263, 2. , 0.70710678], 3926 [1.05901699, 0.08346263, 2. , 0.70710678], 3927 [1.05901699, 0.08346263, 2. , 0.70710678], 3928 [1.05901699, 0.08346263, 2. , 0.70710678], 3929 [1.74535599, 1.08655358, 3. , 1.15470054], 3930 [1.91202266, 1.37522872, 3. , 1.15470054], 3931 [3.25 , 0.25 , 3. , 0. ]]) 3932 3933 Here, `scipy.cluster.hierarchy.maxinconsts` can be used to compute 3934 the maximum value of the inconsistency statistic (the last column of 3935 ``R``) for each non-singleton cluster and its children: 3936 3937 >>> maxinconsts(Z, R) 3938 array([0. , 0. , 0. , 0. , 0.70710678, 3939 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054, 3940 1.15470054]) 3941 3942 """ 3943 Z = np.asarray(Z, order='c') 3944 R = np.asarray(R, order='c') 3945 is_valid_linkage(Z, throw=True, name='Z') 3946 is_valid_im(R, throw=True, name='R') 3947 3948 n = Z.shape[0] + 1 3949 if Z.shape[0] != R.shape[0]: 3950 raise ValueError("The inconsistency matrix and linkage matrix each " 3951 "have a different number of rows.") 3952 MI = np.zeros((n - 1,)) 3953 [Z, R] = _copy_arrays_if_base_present([Z, R]) 3954 _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MI, int(n), 3) 3955 return MI 3956 3957 3958def maxRstat(Z, R, i): 3959 """ 3960 Return the maximum statistic for each non-singleton cluster and its 3961 children. 3962 3963 Parameters 3964 ---------- 3965 Z : array_like 3966 The hierarchical clustering encoded as a matrix. See `linkage` for more 3967 information. 3968 R : array_like 3969 The inconsistency matrix. 3970 i : int 3971 The column of `R` to use as the statistic. 3972 3973 Returns 3974 ------- 3975 MR : ndarray 3976 Calculates the maximum statistic for the i'th column of the 3977 inconsistency matrix `R` for each non-singleton cluster 3978 node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]``, where 3979 ``Q(j)`` the set of all node ids corresponding to nodes below 3980 and including ``j``. 3981 3982 See Also 3983 -------- 3984 linkage : for a description of what a linkage matrix is. 3985 inconsistent : for the creation of a inconsistency matrix. 3986 3987 Examples 3988 -------- 3989 >>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat 3990 >>> from scipy.spatial.distance import pdist 3991 3992 Given a data set ``X``, we can apply a clustering method to obtain a 3993 linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can 3994 be also used to obtain the inconsistency matrix ``R`` associated to 3995 this clustering process: 3996 3997 >>> X = [[0, 0], [0, 1], [1, 0], 3998 ... [0, 4], [0, 3], [1, 4], 3999 ... [4, 0], [3, 0], [4, 1], 4000 ... [4, 4], [3, 4], [4, 3]] 4001 4002 >>> Z = median(pdist(X)) 4003 >>> R = inconsistent(Z) 4004 >>> R 4005 array([[1. , 0. , 1. , 0. ], 4006 [1. , 0. , 1. , 0. ], 4007 [1. , 0. , 1. , 0. ], 4008 [1. , 0. , 1. , 0. ], 4009 [1.05901699, 0.08346263, 2. , 0.70710678], 4010 [1.05901699, 0.08346263, 2. , 0.70710678], 4011 [1.05901699, 0.08346263, 2. , 0.70710678], 4012 [1.05901699, 0.08346263, 2. , 0.70710678], 4013 [1.74535599, 1.08655358, 3. , 1.15470054], 4014 [1.91202266, 1.37522872, 3. , 1.15470054], 4015 [3.25 , 0.25 , 3. , 0. ]]) 4016 4017 `scipy.cluster.hierarchy.maxRstat` can be used to compute 4018 the maximum value of each column of ``R``, for each non-singleton 4019 cluster and its children: 4020 4021 >>> maxRstat(Z, R, 0) 4022 array([1. , 1. , 1. , 1. , 1.05901699, 4023 1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266, 4024 3.25 ]) 4025 >>> maxRstat(Z, R, 1) 4026 array([0. , 0. , 0. , 0. , 0.08346263, 4027 0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872, 4028 1.37522872]) 4029 >>> maxRstat(Z, R, 3) 4030 array([0. , 0. , 0. , 0. , 0.70710678, 4031 0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054, 4032 1.15470054]) 4033 4034 """ 4035 Z = np.asarray(Z, order='c') 4036 R = np.asarray(R, order='c') 4037 is_valid_linkage(Z, throw=True, name='Z') 4038 is_valid_im(R, throw=True, name='R') 4039 if type(i) is not int: 4040 raise TypeError('The third argument must be an integer.') 4041 if i < 0 or i > 3: 4042 raise ValueError('i must be an integer between 0 and 3 inclusive.') 4043 4044 if Z.shape[0] != R.shape[0]: 4045 raise ValueError("The inconsistency matrix and linkage matrix each " 4046 "have a different number of rows.") 4047 4048 n = Z.shape[0] + 1 4049 MR = np.zeros((n - 1,)) 4050 [Z, R] = _copy_arrays_if_base_present([Z, R]) 4051 _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, int(n), i) 4052 return MR 4053 4054 4055def leaders(Z, T): 4056 """ 4057 Return the root nodes in a hierarchical clustering. 4058 4059 Returns the root nodes in a hierarchical clustering corresponding 4060 to a cut defined by a flat cluster assignment vector ``T``. See 4061 the ``fcluster`` function for more information on the format of ``T``. 4062 4063 For each flat cluster :math:`j` of the :math:`k` flat clusters 4064 represented in the n-sized flat cluster assignment vector ``T``, 4065 this function finds the lowest cluster node :math:`i` in the linkage 4066 tree Z, such that: 4067 4068 * leaf descendants belong only to flat cluster j 4069 (i.e., ``T[p]==j`` for all :math:`p` in :math:`S(i)`, where 4070 :math:`S(i)` is the set of leaf ids of descendant leaf nodes 4071 with cluster node :math:`i`) 4072 4073 * there does not exist a leaf that is not a descendant with 4074 :math:`i` that also belongs to cluster :math:`j` 4075 (i.e., ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If 4076 this condition is violated, ``T`` is not a valid cluster 4077 assignment vector, and an exception will be thrown. 4078 4079 Parameters 4080 ---------- 4081 Z : ndarray 4082 The hierarchical clustering encoded as a matrix. See 4083 `linkage` for more information. 4084 T : ndarray 4085 The flat cluster assignment vector. 4086 4087 Returns 4088 ------- 4089 L : ndarray 4090 The leader linkage node id's stored as a k-element 1-D array, 4091 where ``k`` is the number of flat clusters found in ``T``. 4092 4093 ``L[j]=i`` is the linkage cluster node id that is the 4094 leader of flat cluster with id M[j]. If ``i < n``, ``i`` 4095 corresponds to an original observation, otherwise it 4096 corresponds to a non-singleton cluster. 4097 M : ndarray 4098 The leader linkage node id's stored as a k-element 1-D array, where 4099 ``k`` is the number of flat clusters found in ``T``. This allows the 4100 set of flat cluster ids to be any arbitrary set of ``k`` integers. 4101 4102 For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with 4103 id 8's leader is linkage node 2. 4104 4105 See Also 4106 -------- 4107 fcluster : for the creation of flat cluster assignments. 4108 4109 Examples 4110 -------- 4111 >>> from scipy.cluster.hierarchy import ward, fcluster, leaders 4112 >>> from scipy.spatial.distance import pdist 4113 4114 Given a linkage matrix ``Z`` - obtained after apply a clustering method 4115 to a dataset ``X`` - and a flat cluster assignment array ``T``: 4116 4117 >>> X = [[0, 0], [0, 1], [1, 0], 4118 ... [0, 4], [0, 3], [1, 4], 4119 ... [4, 0], [3, 0], [4, 1], 4120 ... [4, 4], [3, 4], [4, 3]] 4121 4122 >>> Z = ward(pdist(X)) 4123 >>> Z 4124 array([[ 0. , 1. , 1. , 2. ], 4125 [ 3. , 4. , 1. , 2. ], 4126 [ 6. , 7. , 1. , 2. ], 4127 [ 9. , 10. , 1. , 2. ], 4128 [ 2. , 12. , 1.29099445, 3. ], 4129 [ 5. , 13. , 1.29099445, 3. ], 4130 [ 8. , 14. , 1.29099445, 3. ], 4131 [11. , 15. , 1.29099445, 3. ], 4132 [16. , 17. , 5.77350269, 6. ], 4133 [18. , 19. , 5.77350269, 6. ], 4134 [20. , 21. , 8.16496581, 12. ]]) 4135 4136 >>> T = fcluster(Z, 3, criterion='distance') 4137 >>> T 4138 array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32) 4139 4140 `scipy.cluster.hierarchy.leaders` returns the indices of the nodes 4141 in the dendrogram that are the leaders of each flat cluster: 4142 4143 >>> L, M = leaders(Z, T) 4144 >>> L 4145 array([16, 17, 18, 19], dtype=int32) 4146 4147 (remember that indices 0-11 point to the 12 data points in ``X``, 4148 whereas indices 12-22 point to the 11 rows of ``Z``) 4149 4150 `scipy.cluster.hierarchy.leaders` also returns the indices of 4151 the flat clusters in ``T``: 4152 4153 >>> M 4154 array([1, 2, 3, 4], dtype=int32) 4155 4156 """ 4157 Z = np.asarray(Z, order='c') 4158 T = np.asarray(T, order='c') 4159 if type(T) != np.ndarray or T.dtype != 'i': 4160 raise TypeError('T must be a one-dimensional numpy array of integers.') 4161 is_valid_linkage(Z, throw=True, name='Z') 4162 if len(T) != Z.shape[0] + 1: 4163 raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.') 4164 4165 Cl = np.unique(T) 4166 kk = len(Cl) 4167 L = np.zeros((kk,), dtype='i') 4168 M = np.zeros((kk,), dtype='i') 4169 n = Z.shape[0] + 1 4170 [Z, T] = _copy_arrays_if_base_present([Z, T]) 4171 s = _hierarchy.leaders(Z, T, L, M, int(kk), int(n)) 4172 if s >= 0: 4173 raise ValueError(('T is not a valid assignment vector. Error found ' 4174 'when examining linkage node %d (< 2n-1).') % s) 4175 return (L, M) 4176