1"""
2Hierarchical clustering (:mod:`scipy.cluster.hierarchy`)
3========================================================
4
5.. currentmodule:: scipy.cluster.hierarchy
6
7These functions cut hierarchical clusterings into flat clusterings
8or find the roots of the forest formed by a cut by providing the flat
9cluster ids of each observation.
10
11.. autosummary::
12   :toctree: generated/
13
14   fcluster
15   fclusterdata
16   leaders
17
18These are routines for agglomerative clustering.
19
20.. autosummary::
21   :toctree: generated/
22
23   linkage
24   single
25   complete
26   average
27   weighted
28   centroid
29   median
30   ward
31
32These routines compute statistics on hierarchies.
33
34.. autosummary::
35   :toctree: generated/
36
37   cophenet
38   from_mlab_linkage
39   inconsistent
40   maxinconsts
41   maxdists
42   maxRstat
43   to_mlab_linkage
44
45Routines for visualizing flat clusters.
46
47.. autosummary::
48   :toctree: generated/
49
50   dendrogram
51
52These are data structures and routines for representing hierarchies as
53tree objects.
54
55.. autosummary::
56   :toctree: generated/
57
58   ClusterNode
59   leaves_list
60   to_tree
61   cut_tree
62   optimal_leaf_ordering
63
64These are predicates for checking the validity of linkage and
65inconsistency matrices as well as for checking isomorphism of two
66flat cluster assignments.
67
68.. autosummary::
69   :toctree: generated/
70
71   is_valid_im
72   is_valid_linkage
73   is_isomorphic
74   is_monotonic
75   correspond
76   num_obs_linkage
77
78Utility routines for plotting:
79
80.. autosummary::
81   :toctree: generated/
82
83   set_link_color_palette
84
85Utility classes:
86
87.. autosummary::
88   :toctree: generated/
89
90   DisjointSet -- data structure for incremental connectivity queries
91
92"""
93# Copyright (C) Damian Eads, 2007-2008. New BSD License.
94
95# hierarchy.py (derived from cluster.py, http://scipy-cluster.googlecode.com)
96#
97# Author: Damian Eads
98# Date:   September 22, 2007
99#
100# Copyright (c) 2007, 2008, Damian Eads
101#
102# All rights reserved.
103#
104# Redistribution and use in source and binary forms, with or without
105# modification, are permitted provided that the following conditions
106# are met:
107#   - Redistributions of source code must retain the above
108#     copyright notice, this list of conditions and the
109#     following disclaimer.
110#   - Redistributions in binary form must reproduce the above copyright
111#     notice, this list of conditions and the following disclaimer
112#     in the documentation and/or other materials provided with the
113#     distribution.
114#   - Neither the name of the author nor the names of its
115#     contributors may be used to endorse or promote products derived
116#     from this software without specific prior written permission.
117#
118# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
119# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
120# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
121# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
122# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
123# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
124# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
125# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
126# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
127# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
128# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
129
130import warnings
131import bisect
132from collections import deque
133
134import numpy as np
135from . import _hierarchy, _optimal_leaf_ordering
136import scipy.spatial.distance as distance
137from scipy._lib._disjoint_set import DisjointSet
138
139
# Integer codes used to select the linkage update formula in the
# compiled ``_hierarchy`` backend.
_LINKAGE_METHODS = {'single': 0, 'complete': 1, 'average': 2, 'centroid': 3,
                    'median': 4, 'ward': 5, 'weighted': 6}
# Methods that are only well-defined for Euclidean pairwise distances.
_EUCLIDEAN_METHODS = ('centroid', 'median', 'ward')

# Public API of this module.
__all__ = ['ClusterNode', 'DisjointSet', 'average', 'centroid', 'complete',
           'cophenet', 'correspond', 'cut_tree', 'dendrogram', 'fcluster',
           'fclusterdata', 'from_mlab_linkage', 'inconsistent',
           'is_isomorphic', 'is_monotonic', 'is_valid_im', 'is_valid_linkage',
           'leaders', 'leaves_list', 'linkage', 'maxRstat', 'maxdists',
           'maxinconsts', 'median', 'num_obs_linkage', 'optimal_leaf_ordering',
           'set_link_color_palette', 'single', 'to_mlab_linkage', 'to_tree',
           'ward', 'weighted', 'distance']
152
153
# Warning category used by scipy.cluster routines to flag suspect input
# (e.g. questionable linkage or inconsistency matrices).
class ClusterWarning(UserWarning):
    pass
156
157
def _warning(s):
    """Issue a `ClusterWarning` carrying message ``s``.

    ``stacklevel=3`` attributes the warning to the user's call site
    rather than to this helper or its immediate caller.
    """
    message = 'scipy.cluster: %s' % s
    warnings.warn(message, ClusterWarning, stacklevel=3)
160
161
162def _copy_array_if_base_present(a):
163    """
164    Copy the array if its base points to a parent array.
165    """
166    if a.base is not None:
167        return a.copy()
168    elif np.issubsctype(a, np.float32):
169        return np.array(a, dtype=np.double)
170    else:
171        return a
172
173
def _copy_arrays_if_base_present(T):
    """
    Apply `_copy_array_if_base_present` to every array in the iterable ``T``.

    Each array that is a view onto a parent array is copied; all others
    are passed through by reference. Useful before handing arrays to C
    code that does not do proper striding.

    Returns
    -------
    list of ndarray
    """
    return [_copy_array_if_base_present(arr) for arr in T]
183
184
185def _randdm(pnts):
186    """
187    Generate a random distance matrix stored in condensed form.
188
189    Parameters
190    ----------
191    pnts : int
192        The number of points in the distance matrix. Has to be at least 2.
193
194    Returns
195    -------
196    D : ndarray
197        A ``pnts * (pnts - 1) / 2`` sized vector is returned.
198    """
199    if pnts >= 2:
200        D = np.random.rand(pnts * (pnts - 1) / 2)
201    else:
202        raise ValueError("The number of points in the distance matrix "
203                         "must be at least 2.")
204    return D
205
206
def single(y):
    """
    Perform single/min/nearest linkage on the condensed distance matrix ``y``.

    Parameters
    ----------
    y : ndarray
        The upper triangular of the distance matrix, as returned by
        ``pdist``.

    Returns
    -------
    Z : ndarray
        The linkage matrix encoding the hierarchical clustering; see
        `linkage` for a detailed description of its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    This is a convenience shorthand; `scipy.cluster.hierarchy.fcluster`
    can flatten the result and `scipy.cluster.hierarchy.dendrogram` can
    plot it.
    """
    # Delegate to the general driver with the nearest-point method.
    return linkage(y, metric='euclidean', method='single')
283
284
def complete(y):
    """
    Perform complete/max/farthest point linkage on a condensed distance matrix.

    Parameters
    ----------
    y : ndarray
        The upper triangular of the distance matrix, as returned by
        ``pdist``.

    Returns
    -------
    Z : ndarray
        A linkage matrix containing the hierarchical clustering; consult
        the `linkage` documentation for details on its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    Use `scipy.cluster.hierarchy.fcluster` to cut the result into flat
    clusters and `scipy.cluster.hierarchy.dendrogram` to visualize it.
    """
    # Delegate to the general driver with the farthest-point method.
    return linkage(y, metric='euclidean', method='complete')
365
366
def average(y):
    """
    Perform average/UPGMA linkage on a condensed distance matrix.

    Parameters
    ----------
    y : ndarray
        The upper triangular of the distance matrix, as returned by
        ``pdist``.

    Returns
    -------
    Z : ndarray
        A linkage matrix containing the hierarchical clustering; see
        `linkage` for more information on its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    Flat clusterings can be derived from the result with
    `scipy.cluster.hierarchy.fcluster`, and
    `scipy.cluster.hierarchy.dendrogram` plots it.
    """
    # Delegate to the general driver with the UPGMA method.
    return linkage(y, metric='euclidean', method='average')
447
448
def weighted(y):
    """
    Perform weighted/WPGMA linkage on the condensed distance matrix.

    See `linkage` for more information on the return structure and
    algorithm.

    Parameters
    ----------
    y : ndarray
        The upper triangular of the distance matrix, as returned by
        ``pdist``.

    Returns
    -------
    Z : ndarray
        A linkage matrix containing the hierarchical clustering; see
        `linkage` for more information on its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    `scipy.cluster.hierarchy.fcluster` turns the result into flat
    clusters; `scipy.cluster.hierarchy.dendrogram` draws it.
    """
    # Delegate to the general driver with the WPGMA method.
    return linkage(y, metric='euclidean', method='weighted')
532
533
def centroid(y):
    """
    Perform centroid/UPGMC linkage.

    See `linkage` for more information on the input matrix, return
    structure, and algorithm.

    The following are common calling conventions:

    1. ``Z = centroid(y)``

       Performs centroid/UPGMC linkage on the condensed distance
       matrix ``y``.

    2. ``Z = centroid(X)``

       Performs centroid/UPGMC linkage on the observation matrix ``X``
       using Euclidean distance as the distance metric.

    Parameters
    ----------
    y : ndarray
        A condensed distance matrix (a flat array containing the upper
        triangular of the distance matrix, as returned by ``pdist``).
        Alternatively, m observation vectors in n dimensions may be
        passed as an m by n array.

    Returns
    -------
    Z : ndarray
        A linkage matrix containing the hierarchical clustering; consult
        the `linkage` documentation for details on its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    `scipy.cluster.hierarchy.fcluster` cuts the result into flat
    clusters; `scipy.cluster.hierarchy.dendrogram` plots it.
    """
    # Delegate to the general driver with the UPGMC method.
    return linkage(y, metric='euclidean', method='centroid')
634
635
def median(y):
    """
    Perform median/WPGMC linkage.

    See `linkage` for more information on the return structure and
    algorithm.

    The following are common calling conventions:

    1. ``Z = median(y)``

       Performs median/WPGMC linkage on the condensed distance matrix
       ``y``.

    2. ``Z = median(X)``

       Performs median/WPGMC linkage on the observation matrix ``X``
       using Euclidean distance as the distance metric.

    Parameters
    ----------
    y : ndarray
        A condensed distance matrix (a flat array containing the upper
        triangular of the distance matrix, as returned by ``pdist``).
        Alternatively, m observation vectors in n dimensions may be
        passed as an m by n array.

    Returns
    -------
    Z : ndarray
        The hierarchical clustering encoded as a linkage matrix.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    Use `scipy.cluster.hierarchy.fcluster` to extract flat clusters from
    the result and `scipy.cluster.hierarchy.dendrogram` to plot it.
    """
    # Delegate to the general driver with the WPGMC method.
    return linkage(y, metric='euclidean', method='median')
736
737
def ward(y):
    """
    Perform Ward's linkage on a condensed distance matrix.

    See `linkage` for more information on the return structure and
    algorithm.

    The following are common calling conventions:

    1. ``Z = ward(y)``
       Performs Ward's linkage on the condensed distance matrix ``y``.

    2. ``Z = ward(X)``
       Performs Ward's linkage on the observation matrix ``X`` using
       Euclidean distance as the distance metric.

    Parameters
    ----------
    y : ndarray
        A condensed distance matrix (a flat array containing the upper
        triangular of the distance matrix, as returned by ``pdist``).
        Alternatively, m observation vectors in n dimensions may be
        passed as an m by n array.

    Returns
    -------
    Z : ndarray
        The hierarchical clustering encoded as a linkage matrix; see
        `linkage` for more information on its structure.

    See Also
    --------
    linkage : for advanced creation of hierarchical clusterings.
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    Flat clusters can be obtained from the result via
    `scipy.cluster.hierarchy.fcluster`, and
    `scipy.cluster.hierarchy.dendrogram` can plot it.
    """
    # Delegate to the general driver with the Ward variance-minimization
    # method.
    return linkage(y, metric='euclidean', method='ward')
835
836
837def linkage(y, method='single', metric='euclidean', optimal_ordering=False):
838    """
839    Perform hierarchical/agglomerative clustering.
840
841    The input y may be either a 1-D condensed distance matrix
842    or a 2-D array of observation vectors.
843
844    If y is a 1-D condensed distance matrix,
845    then y must be a :math:`\\binom{n}{2}` sized
846    vector, where n is the number of original observations paired
847    in the distance matrix. The behavior of this function is very
848    similar to the MATLAB linkage function.
849
850    A :math:`(n-1)` by 4 matrix ``Z`` is returned. At the
851    :math:`i`-th iteration, clusters with indices ``Z[i, 0]`` and
852    ``Z[i, 1]`` are combined to form cluster :math:`n + i`. A
853    cluster with an index less than :math:`n` corresponds to one of
854    the :math:`n` original observations. The distance between
855    clusters ``Z[i, 0]`` and ``Z[i, 1]`` is given by ``Z[i, 2]``. The
856    fourth value ``Z[i, 3]`` represents the number of original
857    observations in the newly formed cluster.
858
859    The following linkage methods are used to compute the distance
860    :math:`d(s, t)` between two clusters :math:`s` and
861    :math:`t`. The algorithm begins with a forest of clusters that
862    have yet to be used in the hierarchy being formed. When two
863    clusters :math:`s` and :math:`t` from this forest are combined
864    into a single cluster :math:`u`, :math:`s` and :math:`t` are
865    removed from the forest, and :math:`u` is added to the
866    forest. When only one cluster remains in the forest, the algorithm
867    stops, and this cluster becomes the root.
868
869    A distance matrix is maintained at each iteration. The ``d[i,j]``
870    entry corresponds to the distance between cluster :math:`i` and
871    :math:`j` in the original forest.
872
873    At each iteration, the algorithm must update the distance matrix
874    to reflect the distance of the newly formed cluster u with the
875    remaining clusters in the forest.
876
877    Suppose there are :math:`|u|` original observations
878    :math:`u[0], \\ldots, u[|u|-1]` in cluster :math:`u` and
879    :math:`|v|` original objects :math:`v[0], \\ldots, v[|v|-1]` in
880    cluster :math:`v`. Recall, :math:`s` and :math:`t` are
881    combined to form cluster :math:`u`. Let :math:`v` be any
882    remaining cluster in the forest that is not :math:`u`.
883
884    The following are methods for calculating the distance between the
885    newly formed cluster :math:`u` and each :math:`v`.
886
887      * method='single' assigns
888
889        .. math::
890           d(u,v) = \\min(dist(u[i],v[j]))
891
892        for all points :math:`i` in cluster :math:`u` and
893        :math:`j` in cluster :math:`v`. This is also known as the
894        Nearest Point Algorithm.
895
896      * method='complete' assigns
897
898        .. math::
899           d(u, v) = \\max(dist(u[i],v[j]))
900
901        for all points :math:`i` in cluster u and :math:`j` in
902        cluster :math:`v`. This is also known by the Farthest Point
903        Algorithm or Voor Hees Algorithm.
904
905      * method='average' assigns
906
907        .. math::
908           d(u,v) = \\sum_{ij} \\frac{d(u[i], v[j])}
909                                   {(|u|*|v|)}
910
911        for all points :math:`i` and :math:`j` where :math:`|u|`
912        and :math:`|v|` are the cardinalities of clusters :math:`u`
913        and :math:`v`, respectively. This is also called the UPGMA
914        algorithm.
915
916      * method='weighted' assigns
917
918        .. math::
919           d(u,v) = (dist(s,v) + dist(t,v))/2
920
921        where cluster u was formed with cluster s and t and v
922        is a remaining cluster in the forest (also called WPGMA).
923
924      * method='centroid' assigns
925
926        .. math::
927           dist(s,t) = ||c_s-c_t||_2
928
929        where :math:`c_s` and :math:`c_t` are the centroids of
930        clusters :math:`s` and :math:`t`, respectively. When two
931        clusters :math:`s` and :math:`t` are combined into a new
932        cluster :math:`u`, the new centroid is computed over all the
933        original objects in clusters :math:`s` and :math:`t`. The
934        distance then becomes the Euclidean distance between the
935        centroid of :math:`u` and the centroid of a remaining cluster
936        :math:`v` in the forest. This is also known as the UPGMC
937        algorithm.
938
939      * method='median' assigns :math:`d(s,t)` like the ``centroid``
940        method. When two clusters :math:`s` and :math:`t` are combined
941        into a new cluster :math:`u`, the average of centroids s and t
942        give the new centroid :math:`u`. This is also known as the
943        WPGMC algorithm.
944
945      * method='ward' uses the Ward variance minimization algorithm.
946        The new entry :math:`d(u,v)` is computed as follows,
947
948        .. math::
949
950           d(u,v) = \\sqrt{\\frac{|v|+|s|}
951                               {T}d(v,s)^2
952                        + \\frac{|v|+|t|}
953                               {T}d(v,t)^2
954                        - \\frac{|v|}
955                               {T}d(s,t)^2}
956
957        where :math:`u` is the newly joined cluster consisting of
958        clusters :math:`s` and :math:`t`, :math:`v` is an unused
959        cluster in the forest, :math:`T=|v|+|s|+|t|`, and
960        :math:`|*|` is the cardinality of its argument. This is also
961        known as the incremental algorithm.
962
963    Warning: When the minimum distance pair in the forest is chosen, there
964    may be two or more pairs with the same minimum distance. This
965    implementation may choose a different minimum than the MATLAB
966    version.
967
968    Parameters
969    ----------
970    y : ndarray
971        A condensed distance matrix. A condensed distance matrix
972        is a flat array containing the upper triangular of the distance matrix.
973        This is the form that ``pdist`` returns. Alternatively, a collection of
974        :math:`m` observation vectors in :math:`n` dimensions may be passed as
975        an :math:`m` by :math:`n` array. All elements of the condensed distance
976        matrix must be finite, i.e., no NaNs or infs.
977    method : str, optional
978        The linkage algorithm to use. See the ``Linkage Methods`` section below
979        for full descriptions.
980    metric : str or function, optional
981        The distance metric to use in the case that y is a collection of
982        observation vectors; ignored otherwise. See the ``pdist``
983        function for a list of valid distance metrics. A custom distance
984        function can also be used.
985    optimal_ordering : bool, optional
986        If True, the linkage matrix will be reordered so that the distance
987        between successive leaves is minimal. This results in a more intuitive
        tree structure when the data are visualized. Defaults to False, because
989        this algorithm can be slow, particularly on large datasets [2]_. See
990        also the `optimal_leaf_ordering` function.
991
992        .. versionadded:: 1.0.0
993
994    Returns
995    -------
996    Z : ndarray
997        The hierarchical clustering encoded as a linkage matrix.
998
999    Notes
1000    -----
1001    1. For method 'single', an optimized algorithm based on minimum spanning
1002       tree is implemented. It has time complexity :math:`O(n^2)`.
1003       For methods 'complete', 'average', 'weighted' and 'ward', an algorithm
1004       called nearest-neighbors chain is implemented. It also has time
1005       complexity :math:`O(n^2)`.
1006       For other methods, a naive algorithm is implemented with :math:`O(n^3)`
1007       time complexity.
1008       All algorithms use :math:`O(n^2)` memory.
1009       Refer to [1]_ for details about the algorithms.
1010    2. Methods 'centroid', 'median', and 'ward' are correctly defined only if
1011       Euclidean pairwise metric is used. If `y` is passed as precomputed
1012       pairwise distances, then it is the user's responsibility to assure that
1013       these distances are in fact Euclidean, otherwise the produced result
1014       will be incorrect.
1015
1016    See Also
1017    --------
1018    scipy.spatial.distance.pdist : pairwise distance metrics
1019
1020    References
1021    ----------
1022    .. [1] Daniel Mullner, "Modern hierarchical, agglomerative clustering
1023           algorithms", :arXiv:`1109.2378v1`.
1024    .. [2] Ziv Bar-Joseph, David K. Gifford, Tommi S. Jaakkola, "Fast optimal
1025           leaf ordering for hierarchical clustering", 2001. Bioinformatics
1026           :doi:`10.1093/bioinformatics/17.suppl_1.S22`
1027
1028    Examples
1029    --------
1030    >>> from scipy.cluster.hierarchy import dendrogram, linkage
1031    >>> from matplotlib import pyplot as plt
1032    >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
1033
1034    >>> Z = linkage(X, 'ward')
1035    >>> fig = plt.figure(figsize=(25, 10))
1036    >>> dn = dendrogram(Z)
1037
1038    >>> Z = linkage(X, 'single')
1039    >>> fig = plt.figure(figsize=(25, 10))
1040    >>> dn = dendrogram(Z)
1041    >>> plt.show()
1042    """
1043    if method not in _LINKAGE_METHODS:
1044        raise ValueError("Invalid method: {0}".format(method))
1045
1046    y = _convert_to_double(np.asarray(y, order='c'))
1047
1048    if y.ndim == 1:
1049        distance.is_valid_y(y, throw=True, name='y')
1050        [y] = _copy_arrays_if_base_present([y])
1051    elif y.ndim == 2:
1052        if method in _EUCLIDEAN_METHODS and metric != 'euclidean':
1053            raise ValueError("Method '{0}' requires the distance metric "
1054                             "to be Euclidean".format(method))
1055        if y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0):
1056            if np.all(y >= 0) and np.allclose(y, y.T):
1057                _warning('The symmetric non-negative hollow observation '
1058                         'matrix looks suspiciously like an uncondensed '
1059                         'distance matrix')
1060        y = distance.pdist(y, metric)
1061    else:
1062        raise ValueError("`y` must be 1 or 2 dimensional.")
1063
1064    if not np.all(np.isfinite(y)):
1065        raise ValueError("The condensed distance matrix must contain only "
1066                         "finite values.")
1067
1068    n = int(distance.num_obs_y(y))
1069    method_code = _LINKAGE_METHODS[method]
1070
1071    if method == 'single':
1072        result = _hierarchy.mst_single_linkage(y, n)
1073    elif method in ['complete', 'average', 'weighted', 'ward']:
1074        result = _hierarchy.nn_chain(y, n, method_code)
1075    else:
1076        result = _hierarchy.fast_linkage(y, n, method_code)
1077
1078    if optimal_ordering:
1079        return optimal_leaf_ordering(result, y)
1080    else:
1081        return result
1082
1083
class ClusterNode:
    """
    A tree node class for representing a cluster.

    Leaf nodes correspond to original observations, while non-leaf nodes
    correspond to non-singleton clusters.

    The `to_tree` function converts a matrix returned by the linkage
    function into an easy-to-use tree representation.

    All parameter names are also attributes.

    Parameters
    ----------
    id : int
        The node id.
    left : ClusterNode instance, optional
        The left child tree node.
    right : ClusterNode instance, optional
        The right child tree node.
    dist : float, optional
        Distance for this cluster in the linkage matrix.
    count : int, optional
        The number of samples in this cluster.

    See Also
    --------
    to_tree : for converting a linkage matrix ``Z`` into a tree object.

    """

    def __init__(self, id, left=None, right=None, dist=0, count=1):
        if id < 0:
            raise ValueError('The id must be non-negative.')
        if dist < 0:
            raise ValueError('The distance must be non-negative.')
        # A node has either no children (a leaf) or exactly two children.
        if (left is None) != (right is None):
            raise ValueError('Only full or proper binary trees are permitted.'
                             '  This node has one child.')
        if count < 1:
            raise ValueError('A cluster must contain at least one original '
                             'observation.')
        self.id = id
        self.left = left
        self.right = right
        self.dist = dist
        # The ``count`` argument only matters for leaves; an internal node's
        # count is always derived from its children.
        self.count = count if left is None else left.count + right.count

    def _require_cluster_node(self, other):
        # Ordering/equality are defined only against other ClusterNodes.
        if not isinstance(other, ClusterNode):
            raise ValueError("Can't compare ClusterNode "
                             "to type {}".format(type(other)))

    def __lt__(self, node):
        self._require_cluster_node(node)
        return self.dist < node.dist

    def __gt__(self, node):
        self._require_cluster_node(node)
        return self.dist > node.dist

    def __eq__(self, node):
        self._require_cluster_node(node)
        return self.dist == node.dist

    def get_id(self):
        """
        The identifier of the target node.

        For ``0 <= i < n``, `i` corresponds to original observation i.
        For ``n <= i < 2n-1``, `i` corresponds to non-singleton cluster formed
        at iteration ``i-n``.

        Returns
        -------
        id : int
            The identifier of the target node.

        """
        return self.id

    def get_count(self):
        """
        The number of leaf nodes (original observations) belonging to
        the cluster node nd. If the target node is a leaf, 1 is
        returned.

        Returns
        -------
        get_count : int
            The number of leaf nodes below the target node.

        """
        return self.count

    def get_left(self):
        """
        Return a reference to the left child tree object.

        Returns
        -------
        left : ClusterNode
            The left child of the target node. If the node is a leaf,
            None is returned.

        """
        return self.left

    def get_right(self):
        """
        Return a reference to the right child tree object.

        Returns
        -------
        right : ClusterNode
            The right child of the target node. If the node is a leaf,
            None is returned.

        """
        return self.right

    def is_leaf(self):
        """
        Return True if the target node is a leaf.

        Returns
        -------
        leafness : bool
            True if the target node is a leaf node.

        """
        return self.left is None

    def pre_order(self, func=(lambda x: x.id)):
        """
        Perform pre-order traversal without recursive function calls.

        When a leaf node is first encountered, ``func`` is called with
        the leaf node as its argument, and its result is appended to
        the list.

        For example, the statement::

           ids = root.pre_order(lambda x: x.id)

        returns a list of the node ids corresponding to the leaf nodes
        of the tree as they appear from left to right.

        Parameters
        ----------
        func : function
            Applied to each leaf ClusterNode object in the pre-order traversal.
            Given the ``i``-th leaf node in the pre-order traversal ``n[i]``,
            the result of ``func(n[i])`` is stored in ``L[i]``. If not
            provided, the index of the original observation to which the node
            corresponds is used.

        Returns
        -------
        L : list
            The pre-order traversal.

        """
        # Iterative pre-order traversal with an explicit stack. Pushing the
        # right child before the left makes the left subtree pop first, so
        # leaves are emitted left-to-right exactly as in a recursive walk.
        stack = [self]
        leaves = []
        while stack:
            node = stack.pop()
            if node.is_leaf():
                leaves.append(func(node))
            else:
                stack.append(node.right)
                stack.append(node.left)
        return leaves
1283
1284
# Module-level sentinels. NOTE(review): neither name is referenced in this
# part of the file -- presumably kept for backward compatibility; confirm
# against the rest of the module before removing.
_cnode_bare = ClusterNode(0)
_cnode_type = type(ClusterNode)
1287
1288
def _order_cluster_tree(Z):
    """
    Return clustering nodes in bottom-up order by distance.

    Parameters
    ----------
    Z : scipy.cluster.linkage array
        The linkage matrix.

    Returns
    -------
    nodes : list
        A list of ClusterNode objects, sorted by merge distance (ascending).
    """
    nodes = []
    # Breadth-first walk over the tree; each non-leaf node is inserted into
    # its sorted position by distance as it is discovered.
    pending = deque([to_tree(Z)])
    while pending:
        current = pending.popleft()
        if current.is_leaf():
            continue
        bisect.insort_left(nodes, current)
        pending.append(current.get_right())
        pending.append(current.get_left())
    return nodes
1315
1316
def cut_tree(Z, n_clusters=None, height=None):
    """
    Given a linkage matrix Z, return the cut tree.

    Parameters
    ----------
    Z : scipy.cluster.linkage array
        The linkage matrix.
    n_clusters : array_like, optional
        Number of clusters in the tree at the cut point.
    height : array_like, optional
        The height at which to cut the tree. Only possible for ultrametric
        trees.

    Returns
    -------
    cutree : array
        An array indicating group membership at each agglomeration step. I.e.,
        for a full cut tree, in the first column each data point is in its own
        cluster. At the next step, two nodes are merged. Finally, all
        singleton and non-singleton clusters are in one group. If `n_clusters`
        or `height` are given, the columns correspond to the columns of
        `n_clusters` or `height`.

    Examples
    --------
    >>> from scipy import cluster
    >>> import numpy as np
    >>> from numpy.random import default_rng
    >>> rng = default_rng()
    >>> X = rng.random((50, 4))
    >>> Z = cluster.hierarchy.ward(X)
    >>> cutree = cluster.hierarchy.cut_tree(Z, n_clusters=[5, 10])
    >>> cutree[:10]
    array([[0, 0],
           [1, 1],
           [2, 2],
           [3, 3],
           [3, 4],
           [2, 2],
           [0, 0],
           [1, 5],
           [3, 6],
           [4, 7]])  # random

    """
    n_obs = num_obs_linkage(Z)
    merge_nodes = _order_cluster_tree(Z)

    if height is not None and n_clusters is not None:
        raise ValueError("At least one of either height or n_clusters "
                         "must be None")

    # Translate the requested cut(s) into agglomeration-step indices:
    # step 0 is "all singletons", step i is the state after the i-th merge.
    if height is None and n_clusters is None:
        # Full cut tree: one column per step.
        steps = np.arange(n_obs)
    elif height is not None:
        node_heights = np.array([nd.dist for nd in merge_nodes])
        steps = np.searchsorted(node_heights, height)
    else:
        steps = n_obs - np.searchsorted(np.arange(n_obs), n_clusters)

    # Accept a scalar cut as well as an array of cuts.
    if np.ndim(steps) == 0:
        steps = np.atleast_1d(steps)
    n_cols = len(steps)

    groups = np.zeros((n_cols, n_obs), dtype=int)
    labels = np.arange(n_obs)
    if 0 in steps:
        groups[0] = labels

    # Replay the merges bottom-up, relabeling so cluster ids stay compact,
    # and snapshot the labeling at every requested step.
    for step, node in enumerate(merge_nodes, start=1):
        members = node.pre_order()
        merged = labels.copy()
        merged[members] = labels[members].min()
        # Close the gap left by the absorbed cluster id.
        merged[merged > labels[members].max()] -= 1
        if step in steps:
            groups[np.nonzero(step == steps)[0]] = merged
        labels = merged

    return groups.T
1398
1399
def to_tree(Z, rd=False):
    """
    Convert a linkage matrix into an easy-to-use tree object.

    The reference to the root `ClusterNode` object is returned (by default).

    Each `ClusterNode` object has a ``left``, ``right``, ``dist``, ``id``,
    and ``count`` attribute. The left and right attributes point to
    ClusterNode objects that were combined to generate the cluster.
    If both are None then the `ClusterNode` object is a leaf node, its count
    must be 1, and its distance is meaningless but set to 0.

    *Note: This function is provided for the convenience of the library
    user. ClusterNodes are not used as input to any of the functions in this
    library.*

    Parameters
    ----------
    Z : ndarray
        The linkage matrix in proper form (see the `linkage`
        function documentation).
    rd : bool, optional
        When False (default), a reference to the root `ClusterNode` object is
        returned.  Otherwise, a tuple ``(r, d)`` is returned. ``r`` is a
        reference to the root node while ``d`` is a list of `ClusterNode`
        objects - one per original entry in the linkage matrix plus entries
        for all clustering steps. If a cluster id is
        less than the number of samples ``n`` in the data that the linkage
        matrix describes, then it corresponds to a singleton cluster (leaf
        node).
        See `linkage` for more information on the assignment of cluster ids
        to clusters.

    Returns
    -------
    tree : ClusterNode or tuple (ClusterNode, list of ClusterNode)
        If ``rd`` is False, a `ClusterNode`.
        If ``rd`` is True, a list of length ``2*n - 1``, with ``n`` the number
        of samples.  See the description of `rd` above for more details.

    See Also
    --------
    linkage, is_valid_linkage, ClusterNode

    Examples
    --------
    >>> from scipy.cluster import hierarchy
    >>> rng = np.random.default_rng()
    >>> x = rng.random((5, 2))
    >>> Z = hierarchy.linkage(x)
    >>> hierarchy.to_tree(Z)
    <scipy.cluster.hierarchy.ClusterNode object at ...
    >>> rootnode, nodelist = hierarchy.to_tree(Z, rd=True)
    >>> rootnode
    <scipy.cluster.hierarchy.ClusterNode object at ...
    >>> len(nodelist)
    9

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')

    # The linkage matrix has n-1 rows for n original observations.
    n = Z.shape[0] + 1

    # Slots 0..n-1 hold the leaves; slots n..2n-2 are filled as merges occur.
    nodes = [ClusterNode(i) for i in range(n)] + [None] * (n - 1)

    root = None
    for i, row in enumerate(Z):
        left_id, right_id = int(row[0]), int(row[1])
        if left_id > i + n:
            raise ValueError(('Corrupt matrix Z. Index to derivative cluster '
                              'is used before it is formed. See row %d, '
                              'column 0') % left_id)
        if right_id > i + n:
            raise ValueError(('Corrupt matrix Z. Index to derivative cluster '
                              'is used before it is formed. See row %d, '
                              'column 1') % right_id)

        # id = i + n, children = previously formed nodes, dist = row[2].
        root = ClusterNode(i + n, nodes[left_id], nodes[right_id], row[2])
        if row[3] != root.count:
            raise ValueError(('Corrupt matrix Z. The count Z[%d,3] is '
                              'incorrect.') % i)
        nodes[n + i] = root

    return (root, nodes) if rd else root
1497
1498
def optimal_leaf_ordering(Z, y, metric='euclidean'):
    """
    Given a linkage matrix Z and distance, reorder the cut tree.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a linkage matrix. See
        `linkage` for more information on the return structure and
        algorithm.
    y : ndarray
        The condensed distance matrix from which Z was generated.
        Alternatively, a collection of m observation vectors in n
        dimensions may be passed as an m by n array.
    metric : str or function, optional
        The distance metric to use in the case that y is a collection of
        observation vectors; ignored otherwise. See the ``pdist``
        function for a list of valid distance metrics. A custom distance
        function can also be used.

    Returns
    -------
    Z_ordered : ndarray
        A copy of the linkage matrix Z, reordered to minimize the distance
        between adjacent leaves.

    Examples
    --------
    >>> from scipy.cluster import hierarchy
    >>> rng = np.random.default_rng()
    >>> X = rng.standard_normal((10, 10))
    >>> Z = hierarchy.ward(X)
    >>> hierarchy.leaves_list(Z)
    array([0, 3, 1, 9, 2, 5, 7, 4, 6, 8], dtype=int32)
    >>> hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, X))
    array([3, 0, 2, 5, 7, 4, 8, 6, 9, 1], dtype=int32)

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')

    y = _convert_to_double(np.asarray(y, order='c'))

    if y.ndim == 2:
        # 2-D input is a matrix of observation vectors; warn when it looks
        # like a square (uncondensed) distance matrix instead.
        if (y.shape[0] == y.shape[1] and np.allclose(np.diag(y), 0)
                and np.all(y >= 0) and np.allclose(y, y.T)):
            _warning('The symmetric non-negative hollow observation '
                     'matrix looks suspiciously like an uncondensed '
                     'distance matrix')
        y = distance.pdist(y, metric)
    elif y.ndim == 1:
        distance.is_valid_y(y, throw=True, name='y')
        [y] = _copy_arrays_if_base_present([y])
    else:
        raise ValueError("`y` must be 1 or 2 dimensional.")

    if not np.all(np.isfinite(y)):
        raise ValueError("The condensed distance matrix must contain only "
                         "finite values.")

    return _optimal_leaf_ordering.optimal_leaf_ordering(Z, y)
1560
1561
1562def _convert_to_bool(X):
1563    if X.dtype != bool:
1564        X = X.astype(bool)
1565    if not X.flags.contiguous:
1566        X = X.copy()
1567    return X
1568
1569
1570def _convert_to_double(X):
1571    if X.dtype != np.double:
1572        X = X.astype(np.double)
1573    if not X.flags.contiguous:
1574        X = X.copy()
1575    return X
1576
1577
def cophenet(Z, Y=None):
    """
    Calculate the cophenetic distances between each observation in
    the hierarchical clustering defined by the linkage ``Z``.

    Suppose ``p`` and ``q`` are original observations in
    disjoint clusters ``s`` and ``t``, respectively and
    ``s`` and ``t`` are joined by a direct parent cluster
    ``u``. The cophenetic distance between observations
    ``p`` and ``q`` is simply the distance between
    clusters ``s`` and ``t``.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as an array
        (see `linkage` function).
    Y : ndarray (optional)
        Calculates the cophenetic correlation coefficient ``c`` of a
        hierarchical clustering defined by the linkage matrix `Z`
        of a set of :math:`n` observations in :math:`m`
        dimensions. `Y` is the condensed distance matrix from which
        `Z` was generated.

    Returns
    -------
    c : ndarray
        The cophenetic correlation coefficient (if ``Y`` is passed).
    d : ndarray
        The cophenetic distance matrix in condensed form. The
        :math:`ij` th entry is the cophenetic distance between
        original observations :math:`i` and :math:`j`.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    scipy.spatial.distance.squareform : transforming condensed matrices into square ones.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import single, cophenet
    >>> from scipy.spatial.distance import pdist, squareform

    Given a dataset ``X`` and a linkage matrix ``Z``, the cophenetic distance
    between two points of ``X`` is the distance between the largest two
    distinct clusters that each of the points belongs to:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    ``X`` corresponds to this dataset ::

        x x    x x
        x        x

        x        x
        x x    x x

    >>> Z = single(pdist(X))
    >>> Z
    array([[ 0.,  1.,  1.,  2.],
           [ 2., 12.,  1.,  3.],
           [ 3.,  4.,  1.,  2.],
           [ 5., 14.,  1.,  3.],
           [ 6.,  7.,  1.,  2.],
           [ 8., 16.,  1.,  3.],
           [ 9., 10.,  1.,  2.],
           [11., 18.,  1.,  3.],
           [13., 15.,  2.,  6.],
           [17., 20.,  2.,  9.],
           [19., 21.,  2., 12.]])
    >>> cophenet(Z)
    array([1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 2., 2., 2., 2., 2.,
           2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 2., 2.,
           2., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
           1., 1., 2., 2., 2., 1., 2., 2., 2., 2., 2., 2., 1., 1., 1.])

    The output of the `scipy.cluster.hierarchy.cophenet` method is
    represented in condensed form. We can use
    `scipy.spatial.distance.squareform` to see the output as a
    regular matrix (where each element ``ij`` denotes the cophenetic distance
    between each ``i``, ``j`` pair of points in ``X``):

    >>> squareform(cophenet(Z))
    array([[0., 1., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
           [1., 0., 1., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
           [1., 1., 0., 2., 2., 2., 2., 2., 2., 2., 2., 2.],
           [2., 2., 2., 0., 1., 1., 2., 2., 2., 2., 2., 2.],
           [2., 2., 2., 1., 0., 1., 2., 2., 2., 2., 2., 2.],
           [2., 2., 2., 1., 1., 0., 2., 2., 2., 2., 2., 2.],
           [2., 2., 2., 2., 2., 2., 0., 1., 1., 2., 2., 2.],
           [2., 2., 2., 2., 2., 2., 1., 0., 1., 2., 2., 2.],
           [2., 2., 2., 2., 2., 2., 1., 1., 0., 2., 2., 2.],
           [2., 2., 2., 2., 2., 2., 2., 2., 2., 0., 1., 1.],
           [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 0., 1.],
           [2., 2., 2., 2., 2., 2., 2., 2., 2., 1., 1., 0.]])

    In this example, the cophenetic distance between points on ``X`` that are
    very close (i.e., in the same corner) is 1. For other pairs of points is 2,
    because the points will be located in clusters at different
    corners - thus, the distance between these clusters will be larger.

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')
    Zs = Z.shape
    # n original observations for a linkage with n-1 rows.
    n = Zs[0] + 1

    # Output buffer for the condensed cophenetic distance matrix.
    zz = np.zeros((n * (n-1)) // 2, dtype=np.double)
    # The C code does not support strided input; it relies on a contiguous
    # double array and the explicit dimension argument instead.
    Z = _convert_to_double(Z)

    _hierarchy.cophenetic_distances(Z, zz, int(n))
    if Y is None:
        return zz

    Y = np.asarray(Y, order='c')
    distance.is_valid_y(Y, throw=True, name='Y')

    # Pearson correlation between the original condensed distances Y and
    # the cophenetic distances zz, computed from centered vectors.
    z = zz.mean()
    y = Y.mean()
    Yy = Y - y
    Zz = zz - z
    numerator = (Yy * Zz)
    denomA = Yy**2
    denomB = Zz**2
    c = numerator.sum() / np.sqrt((denomA.sum() * denomB.sum()))
    return (c, zz)
1709
1710
def inconsistent(Z, d=2):
    r"""
    Calculate inconsistency statistics on a linkage matrix.

    Parameters
    ----------
    Z : ndarray
        The :math:`(n-1)` by 4 matrix encoding the linkage (hierarchical
        clustering).  See `linkage` documentation for more information on its
        form.
    d : int, optional
        The number of links up to `d` levels below each non-singleton cluster.

    Returns
    -------
    R : ndarray
        A :math:`(n-1)` by 4 matrix where the ``i``'th row contains the link
        statistics for the non-singleton cluster ``i``. The link statistics are
        computed over the link heights for links :math:`d` levels below the
        cluster ``i``. ``R[i,0]`` and ``R[i,1]`` are the mean and standard
        deviation of the link heights, respectively; ``R[i,2]`` is the number
        of links included in the calculation; and ``R[i,3]`` is the
        inconsistency coefficient,

        .. math:: \frac{\mathtt{Z[i,2]} - \mathtt{R[i,0]}} {R[i,1]}

    Notes
    -----
    This function behaves similarly to the MATLAB(TM) ``inconsistent``
    function.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import inconsistent, linkage
    >>> from matplotlib import pyplot as plt
    >>> X = [[i] for i in [2, 8, 0, 4, 1, 9, 9, 0]]
    >>> Z = linkage(X, 'ward')
    >>> print(Z)
    [[ 5.          6.          0.          2.        ]
     [ 2.          7.          0.          2.        ]
     [ 0.          4.          1.          2.        ]
     [ 1.          8.          1.15470054  3.        ]
     [ 9.         10.          2.12132034  4.        ]
     [ 3.         12.          4.11096096  5.        ]
     [11.         13.         14.07183949  8.        ]]
    >>> inconsistent(Z)
    array([[ 0.        ,  0.        ,  1.        ,  0.        ],
           [ 0.        ,  0.        ,  1.        ,  0.        ],
           [ 1.        ,  0.        ,  1.        ,  0.        ],
           [ 0.57735027,  0.81649658,  2.        ,  0.70710678],
           [ 1.04044011,  1.06123822,  3.        ,  1.01850858],
           [ 3.11614065,  1.40688837,  2.        ,  0.70710678],
           [ 6.44583366,  6.76770586,  3.        ,  1.12682288]])

    """
    Z = np.asarray(Z, order='c')

    Zs = Z.shape
    is_valid_linkage(Z, throw=True, name='Z')
    # `d` must be a nonnegative whole number (floats like 2.0 are accepted).
    if (not d == np.floor(d)) or d < 0:
        raise ValueError('The second argument d must be a nonnegative '
                         'integer value.')

    # The C code does not support strided input; ensure Z owns its data
    # (copy if it is a view) before handing it to the extension.
    [Z] = _copy_arrays_if_base_present([Z])

    # n original observations for a linkage with n-1 rows; one row of
    # statistics per non-singleton cluster.
    n = Zs[0] + 1
    R = np.zeros((n - 1, 4), dtype=np.double)

    _hierarchy.inconsistent(Z, R, int(n), int(d))
    return R
1783
1784
def from_mlab_linkage(Z):
    """
    Convert a linkage matrix generated by MATLAB(TM) to a new
    linkage matrix compatible with this module.

    The conversion does two things:

     * the indices are converted from ``1..N`` to ``0..(N-1)`` form,
       and

     * a fourth column ``Z[:,3]`` is added where ``Z[i,3]`` represents the
       number of original observations (leaves) in the non-singleton
       cluster ``i``.

    This function is useful when loading in linkages from legacy data
    files generated by MATLAB.

    Parameters
    ----------
    Z : ndarray
        A linkage matrix generated by MATLAB(TM).

    Returns
    -------
    ZS : ndarray
        A linkage matrix compatible with ``scipy.cluster.hierarchy``.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    to_mlab_linkage : transform from SciPy to MATLAB format.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster.hierarchy import ward, from_mlab_linkage

    Given a linkage matrix in MATLAB format ``mZ``, we can use
    `scipy.cluster.hierarchy.from_mlab_linkage` to import
    it into SciPy format:

    >>> mZ = np.array([[1, 2, 1], [4, 5, 1], [7, 8, 1],
    ...                [10, 11, 1], [3, 13, 1.29099445],
    ...                [6, 14, 1.29099445],
    ...                [9, 15, 1.29099445],
    ...                [12, 16, 1.29099445],
    ...                [17, 18, 5.77350269],
    ...                [19, 20, 5.77350269],
    ...                [21, 22,  8.16496581]])

    >>> Z = from_mlab_linkage(mZ)
    >>> Z
    array([[  0.        ,   1.        ,   1.        ,   2.        ],
           [  3.        ,   4.        ,   1.        ,   2.        ],
           [  6.        ,   7.        ,   1.        ,   2.        ],
           [  9.        ,  10.        ,   1.29099445,   3.        ],
           [  2.        ,  12.        ,   1.29099445,   3.        ],
           [  5.        ,  13.        ,   1.29099445,   3.        ],
           [  8.        ,  14.        ,   1.29099445,   3.        ],
           [ 11.        ,  15.        ,   1.29099445,   3.        ],
           [ 16.        ,  17.        ,   5.77350269,   6.        ],
           [ 18.        ,  19.        ,   5.77350269,   6.        ],
           [ 20.        ,  21.        ,   8.16496581,  12.        ]])

    As expected, the linkage matrix ``Z`` returned includes an
    additional column counting the number of original samples in
    each cluster. Also, all cluster indices are reduced by 1
    (MATLAB format uses 1-indexing, whereas SciPy uses 0-indexing).

    """
    Z = np.asarray(Z, dtype=np.double, order='c')
    shape = Z.shape

    # Scalars and empty 1-D arrays pass through unchanged.
    if len(shape) == 0 or (len(shape) == 1 and shape[0] == 0):
        return Z.copy()

    if len(shape) != 2:
        raise ValueError("The linkage array must be rectangular.")

    num_rows = shape[0]
    # A rectangular matrix with no rows also passes through unchanged.
    if num_rows == 0:
        return Z.copy()

    converted = Z.copy()
    # NOTE(review): `and` means this only rejects input when *both* the
    # minimum and maximum index look wrong; a matrix violating a single
    # bound slips through — confirm whether `or` was intended.
    if converted[:, 0:2].min() != 1.0 and converted[:, 0:2].max() != 2 * num_rows:
        raise ValueError('The format of the indices is not 1..N')

    # Shift the 1-based MATLAB cluster indices to 0-based.
    converted[:, 0:2] -= 1.0
    # Compute the per-row cluster sizes and append them as a fourth column.
    sizes = np.zeros((num_rows,), dtype=np.double)
    _hierarchy.calculate_cluster_sizes(converted, sizes, int(num_rows) + 1)
    return np.hstack([converted, sizes.reshape(num_rows, 1)])
1877
1878
def to_mlab_linkage(Z):
    """
    Convert a linkage matrix to a MATLAB(TM) compatible one.

    Converts a linkage matrix ``Z`` generated by the linkage function
    of this module to a MATLAB(TM) compatible one. The return linkage
    matrix has the last column removed and the cluster indices are
    converted to ``1..N`` indexing.

    Parameters
    ----------
    Z : ndarray
        A linkage matrix generated by ``scipy.cluster.hierarchy``.

    Returns
    -------
    to_mlab_linkage : ndarray
        A linkage matrix compatible with MATLAB(TM)'s hierarchical
        clustering functions.

        The return linkage matrix has the last column removed
        and the cluster indices are converted to ``1..N`` indexing.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    from_mlab_linkage : transform from Matlab to SciPy format.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, to_mlab_linkage
    >>> from scipy.spatial.distance import pdist

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))
    >>> mZ = to_mlab_linkage(Z)
    >>> mZ
    array([[  1.        ,   2.        ,   1.        ],
           [  4.        ,   5.        ,   1.        ],
           [  7.        ,   8.        ,   1.        ],
           [ 10.        ,  11.        ,   1.        ],
           [  3.        ,  13.        ,   1.29099445],
           [  6.        ,  14.        ,   1.29099445],
           [  9.        ,  15.        ,   1.29099445],
           [ 12.        ,  16.        ,   1.29099445],
           [ 17.        ,  18.        ,   5.77350269],
           [ 19.        ,  20.        ,   5.77350269],
           [ 21.        ,  22.        ,   8.16496581]])

    The new linkage matrix ``mZ`` uses 1-indexing for all the
    clusters (instead of 0-indexing). Also, the last column of
    the original linkage matrix has been dropped.

    """
    Z = np.asarray(Z, order='c', dtype=np.double)
    shape = Z.shape

    # Empty inputs are returned unchanged (as a copy).
    if len(shape) == 0 or (len(shape) == 1 and shape[0] == 0):
        return Z.copy()

    is_valid_linkage(Z, throw=True, name='Z')

    # Drop the cluster-size column and shift indices to 1-based.
    converted = Z[:, 0:3].copy()
    converted[:, 0:2] += 1.0
    return converted
1964
1965
def is_monotonic(Z):
    """
    Return True if the linkage passed is monotonic.

    The linkage is monotonic if for every cluster :math:`s` and :math:`t`
    joined, the distance between them is no less than the distance
    between any previously joined clusters.

    Parameters
    ----------
    Z : ndarray
        The linkage matrix to check for monotonicity.

    Returns
    -------
    b : bool
        A boolean indicating whether the linkage is monotonic.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import median, ward, is_monotonic
    >>> from scipy.spatial.distance import pdist

    By definition, some hierarchical clustering algorithms - such as
    `scipy.cluster.hierarchy.ward` - produce monotonic assignments of
    samples to clusters; however, this is not always true for other
    hierarchical methods - e.g. `scipy.cluster.hierarchy.median`.

    Given a linkage matrix ``Z`` (as the result of a hierarchical clustering
    method) we can test programmatically whether it has the monotonicity
    property or not, using `scipy.cluster.hierarchy.is_monotonic`:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))
    >>> is_monotonic(Z)
    True

    >>> Z = median(pdist(X))
    >>> is_monotonic(Z)
    False

    Note that this method is equivalent to just verifying that the distances
    in the third column of the linkage matrix appear in a monotonically
    increasing order.

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')

    # Monotonic iff the merge heights (column 2) never decrease from one
    # row to the next.
    heights = Z[:, 2]
    return (np.diff(heights) >= 0).all()
2049
2050
def is_valid_im(R, warning=False, throw=False, name=None):
    """Return True if the inconsistency matrix passed is valid.

    It must be a :math:`n` by 4 array of doubles. The link height means
    ``R[:,0]``, the standard deviations ``R[:,1]`` and the link counts
    ``R[:,2]`` must all be nonnegative.

    Parameters
    ----------
    R : ndarray
        The inconsistency matrix to check for validity.
    warning : bool, optional
        When True, issues a Python warning if the inconsistency
        matrix passed is invalid.
    throw : bool, optional
        When True, throws a Python exception if the inconsistency
        matrix passed is invalid.
    name : str, optional
        This string refers to the variable name of the invalid
        inconsistency matrix.

    Returns
    -------
    b : bool
        True if the inconsistency matrix is valid.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    inconsistent : for the creation of a inconsistency matrix.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, inconsistent, is_valid_im
    >>> from scipy.spatial.distance import pdist

    Given a data set ``X``, we can apply a clustering method to obtain a
    linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
    be also used to obtain the inconsistency matrix ``R`` associated to
    this clustering process:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))
    >>> R = inconsistent(Z)

    Now we can use `scipy.cluster.hierarchy.is_valid_im` to verify that
    ``R`` is correct:

    >>> is_valid_im(R)
    True

    However, if ``R`` is wrongly constructed (e.g., one of the standard
    deviations is set to a negative value), then the check will fail:

    >>> R[-1,1] = R[-1,1] * -1
    >>> is_valid_im(R)
    False

    """
    R = np.asarray(R, order='c')
    name_str = "%r " % name if name else ''
    try:
        # Structural checks: a 2-D double array with 4 columns and at
        # least one row.
        if type(R) != np.ndarray:
            raise TypeError('Variable %spassed as inconsistency matrix is not '
                            'a numpy array.' % name_str)
        if R.dtype != np.double:
            raise TypeError('Inconsistency matrix %smust contain doubles '
                            '(double).' % name_str)
        if R.ndim != 2:
            raise ValueError('Inconsistency matrix %smust have shape=2 (i.e. '
                             'be two-dimensional).' % name_str)
        if R.shape[1] != 4:
            raise ValueError('Inconsistency matrix %smust have 4 columns.' %
                             name_str)
        if R.shape[0] < 1:
            raise ValueError('Inconsistency matrix %smust have at least one '
                             'row.' % name_str)
        # Value checks: the first three columns must be nonnegative.
        negative_checks = (
            (R[:, 0], 'Inconsistency matrix %scontains negative link '
                      'height means.' % name_str),
            (R[:, 1], 'Inconsistency matrix %scontains negative link '
                      'height standard deviations.' % name_str),
            (R[:, 2], 'Inconsistency matrix %scontains negative link '
                      'counts.' % name_str),
        )
        for column, message in negative_checks:
            if (column < 0).any():
                raise ValueError(message)
    except Exception as e:
        if throw:
            raise
        if warning:
            _warning(str(e))
        return False
    return True
2174
2175
def is_valid_linkage(Z, warning=False, throw=False, name=None):
    """
    Check the validity of a linkage matrix.

    A linkage matrix is valid if it is a 2-D array (type double)
    with :math:`n` rows and 4 columns. The first two columns must contain
    indices between 0 and :math:`2n-1`. For a given row ``i``, the following
    two expressions have to hold:

    .. math::

        0 \\leq \\mathtt{Z[i,0]} \\leq i+n-1
        0 \\leq Z[i,1] \\leq i+n-1

    I.e., a cluster cannot join another cluster unless the cluster being joined
    has been generated.

    Parameters
    ----------
    Z : array_like
        Linkage matrix.
    warning : bool, optional
        When True, issues a Python warning if the linkage
        matrix passed is invalid.
    throw : bool, optional
        When True, throws a Python exception if the linkage
        matrix passed is invalid.
    name : str, optional
        This string refers to the variable name of the invalid
        linkage matrix.

    Returns
    -------
    b : bool
        True if the linkage matrix is valid.

    See Also
    --------
    linkage: for a description of what a linkage matrix is.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, is_valid_linkage
    >>> from scipy.spatial.distance import pdist

    All linkage matrices generated by the clustering methods in this module
    will be valid (i.e., they will have the appropriate dimensions and the two
    required expressions will hold for all the rows).

    We can check this using `scipy.cluster.hierarchy.is_valid_linkage`:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))
    >>> is_valid_linkage(Z)
    True

    However, if we create a linkage matrix in a wrong way - or if we modify
    a valid one in a way that any of the required expressions don't hold
    anymore, then the check will fail:

    >>> Z[3][1] = 20    # the cluster number 20 is not defined at this point
    >>> is_valid_linkage(Z)
    False

    """
    Z = np.asarray(Z, order='c')
    name_str = "%r " % name if name else ''
    try:
        # Structural checks: 2-D double array, 4 columns, at least one row.
        if type(Z) != np.ndarray:
            raise TypeError('Passed linkage argument %sis not a valid array.' %
                            name_str)
        if Z.dtype != np.double:
            raise TypeError('Linkage matrix %smust contain doubles.' % name_str)
        if Z.ndim != 2:
            raise ValueError('Linkage matrix %smust have shape=2 (i.e. be '
                             'two-dimensional).' % name_str)
        if Z.shape[1] != 4:
            raise ValueError('Linkage matrix %smust have 4 columns.' % name_str)
        if Z.shape[0] == 0:
            raise ValueError('Linkage must be computed on at least two '
                             'observations.')
        # Value checks: indices, distances and counts must be nonnegative
        # (only meaningful when more than one merge is present).
        num_rows = Z.shape[0]
        if num_rows > 1:
            if (Z[:, 0:2] < 0).any():
                raise ValueError('Linkage %scontains negative indices.' %
                                 name_str)
            if (Z[:, 2] < 0).any():
                raise ValueError('Linkage %scontains negative distances.' %
                                 name_str)
            if (Z[:, 3] < 0).any():
                raise ValueError('Linkage %scontains negative counts.' %
                                 name_str)
        # Topology checks: every referenced cluster must already exist and
        # may be merged only once.
        if _check_hierarchy_uses_cluster_before_formed(Z):
            raise ValueError('Linkage %suses non-singleton cluster before '
                             'it is formed.' % name_str)
        if _check_hierarchy_uses_cluster_more_than_once(Z):
            raise ValueError('Linkage %suses the same cluster more than once.'
                             % name_str)
    except Exception as e:
        if throw:
            raise
        if warning:
            _warning(str(e))
        return False
    return True
2299
2300
2301def _check_hierarchy_uses_cluster_before_formed(Z):
2302    n = Z.shape[0] + 1
2303    for i in range(0, n - 1):
2304        if Z[i, 0] >= n + i or Z[i, 1] >= n + i:
2305            return True
2306    return False
2307
2308
2309def _check_hierarchy_uses_cluster_more_than_once(Z):
2310    n = Z.shape[0] + 1
2311    chosen = set([])
2312    for i in range(0, n - 1):
2313        if (Z[i, 0] in chosen) or (Z[i, 1] in chosen) or Z[i, 0] == Z[i, 1]:
2314            return True
2315        chosen.add(Z[i, 0])
2316        chosen.add(Z[i, 1])
2317    return False
2318
2319
2320def _check_hierarchy_not_all_clusters_used(Z):
2321    n = Z.shape[0] + 1
2322    chosen = set([])
2323    for i in range(0, n - 1):
2324        chosen.add(int(Z[i, 0]))
2325        chosen.add(int(Z[i, 1]))
2326    must_chosen = set(range(0, 2 * n - 2))
2327    return len(must_chosen.difference(chosen)) > 0
2328
2329
def num_obs_linkage(Z):
    """
    Return the number of original observations of the linkage matrix passed.

    Parameters
    ----------
    Z : ndarray
        The linkage matrix on which to perform the operation.

    Returns
    -------
    n : int
        The number of original observations in the linkage.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, num_obs_linkage
    >>> from scipy.spatial.distance import pdist

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))

    ``Z`` is a linkage matrix obtained after using the Ward clustering method
    with ``X``, a dataset with 12 data points.

    >>> num_obs_linkage(Z)
    12

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')
    # A linkage with k merge rows covers exactly k + 1 original observations.
    return Z.shape[0] + 1
2366
2367
def correspond(Z, Y):
    """
    Check for correspondence between linkage and condensed distance matrices.

    They must have the same number of original observations for
    the check to succeed.

    This function is useful as a sanity check in algorithms that make
    extensive use of linkage and distance matrices that must
    correspond to the same set of original observations.

    Parameters
    ----------
    Z : array_like
        The linkage matrix to check for correspondence.
    Y : array_like
        The condensed distance matrix to check for correspondence.

    Returns
    -------
    b : bool
        A boolean indicating whether the linkage matrix and distance
        matrix could possibly correspond to one another.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, correspond
    >>> from scipy.spatial.distance import pdist

    This method can be used to check if a given linkage matrix ``Z`` has been
    obtained from the application of a cluster method over a dataset ``X``:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]
    >>> X_condensed = pdist(X)
    >>> Z = ward(X_condensed)

    Here, we can compare ``Z`` and ``X`` (in condensed form):

    >>> correspond(Z, X_condensed)
    True

    """
    # Validate both inputs first; either check raises on malformed input.
    is_valid_linkage(Z, throw=True)
    distance.is_valid_y(Y, throw=True)
    Z = np.asarray(Z, order='c')
    Y = np.asarray(Y, order='c')
    # They correspond iff both imply the same number of original observations.
    n_from_linkage = num_obs_linkage(Z)
    n_from_distances = distance.num_obs_y(Y)
    return n_from_linkage == n_from_distances
2422
2423
2424def fcluster(Z, t, criterion='inconsistent', depth=2, R=None, monocrit=None):
2425    """
2426    Form flat clusters from the hierarchical clustering defined by
2427    the given linkage matrix.
2428
2429    Parameters
2430    ----------
2431    Z : ndarray
2432        The hierarchical clustering encoded with the matrix returned
2433        by the `linkage` function.
2434    t : scalar
2435        For criteria 'inconsistent', 'distance' or 'monocrit',
2436         this is the threshold to apply when forming flat clusters.
2437        For 'maxclust' or 'maxclust_monocrit' criteria,
2438         this would be max number of clusters requested.
2439    criterion : str, optional
2440        The criterion to use in forming flat clusters. This can
2441        be any of the following values:
2442
2443          ``inconsistent`` :
2444              If a cluster node and all its
2445              descendants have an inconsistent value less than or equal
2446              to `t`, then all its leaf descendants belong to the
2447              same flat cluster. When no non-singleton cluster meets
2448              this criterion, every node is assigned to its own
2449              cluster. (Default)
2450
2451          ``distance`` :
2452              Forms flat clusters so that the original
2453              observations in each flat cluster have no greater a
2454              cophenetic distance than `t`.
2455
2456          ``maxclust`` :
2457              Finds a minimum threshold ``r`` so that
2458              the cophenetic distance between any two original
2459              observations in the same flat cluster is no more than
2460              ``r`` and no more than `t` flat clusters are formed.
2461
2462          ``monocrit`` :
2463              Forms a flat cluster from a cluster node c
2464              with index i when ``monocrit[j] <= t``.
2465
2466              For example, to threshold on the maximum mean distance
2467              as computed in the inconsistency matrix R with a
2468              threshold of 0.8 do::
2469
2470                  MR = maxRstat(Z, R, 3)
2471                  fcluster(Z, t=0.8, criterion='monocrit', monocrit=MR)
2472
2473          ``maxclust_monocrit`` :
2474              Forms a flat cluster from a
2475              non-singleton cluster node ``c`` when ``monocrit[i] <=
2476              r`` for all cluster indices ``i`` below and including
2477              ``c``. ``r`` is minimized such that no more than ``t``
2478              flat clusters are formed. monocrit must be
2479              monotonic. For example, to minimize the threshold t on
2480              maximum inconsistency values so that no more than 3 flat
2481              clusters are formed, do::
2482
2483                  MI = maxinconsts(Z, R)
2484                  fcluster(Z, t=3, criterion='maxclust_monocrit', monocrit=MI)
2485    depth : int, optional
2486        The maximum depth to perform the inconsistency calculation.
2487        It has no meaning for the other criteria. Default is 2.
2488    R : ndarray, optional
2489        The inconsistency matrix to use for the 'inconsistent'
2490        criterion. This matrix is computed if not provided.
2491    monocrit : ndarray, optional
2492        An array of length n-1. `monocrit[i]` is the
2493        statistics upon which non-singleton i is thresholded. The
2494        monocrit vector must be monotonic, i.e., given a node c with
2495        index i, for all node indices j corresponding to nodes
2496        below c, ``monocrit[i] >= monocrit[j]``.
2497
2498    Returns
2499    -------
2500    fcluster : ndarray
2501        An array of length ``n``. ``T[i]`` is the flat cluster number to
2502        which original observation ``i`` belongs.
2503
2504    See Also
2505    --------
2506    linkage : for information about hierarchical clustering methods work.
2507
2508    Examples
2509    --------
2510    >>> from scipy.cluster.hierarchy import ward, fcluster
2511    >>> from scipy.spatial.distance import pdist
2512
2513    All cluster linkage methods - e.g., `scipy.cluster.hierarchy.ward`
2514    generate a linkage matrix ``Z`` as their output:
2515
2516    >>> X = [[0, 0], [0, 1], [1, 0],
2517    ...      [0, 4], [0, 3], [1, 4],
2518    ...      [4, 0], [3, 0], [4, 1],
2519    ...      [4, 4], [3, 4], [4, 3]]
2520
2521    >>> Z = ward(pdist(X))
2522
2523    >>> Z
2524    array([[ 0.        ,  1.        ,  1.        ,  2.        ],
2525           [ 3.        ,  4.        ,  1.        ,  2.        ],
2526           [ 6.        ,  7.        ,  1.        ,  2.        ],
2527           [ 9.        , 10.        ,  1.        ,  2.        ],
2528           [ 2.        , 12.        ,  1.29099445,  3.        ],
2529           [ 5.        , 13.        ,  1.29099445,  3.        ],
2530           [ 8.        , 14.        ,  1.29099445,  3.        ],
2531           [11.        , 15.        ,  1.29099445,  3.        ],
2532           [16.        , 17.        ,  5.77350269,  6.        ],
2533           [18.        , 19.        ,  5.77350269,  6.        ],
2534           [20.        , 21.        ,  8.16496581, 12.        ]])
2535
2536    This matrix represents a dendrogram, where the first and second elements
2537    are the two clusters merged at each step, the third element is the
2538    distance between these clusters, and the fourth element is the size of
2539    the new cluster - the number of original data points included.
2540
2541    `scipy.cluster.hierarchy.fcluster` can be used to flatten the
2542    dendrogram, obtaining as a result an assignation of the original data
2543    points to single clusters.
2544
2545    This assignation mostly depends on a distance threshold ``t`` - the maximum
2546    inter-cluster distance allowed:
2547
2548    >>> fcluster(Z, t=0.9, criterion='distance')
2549    array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12], dtype=int32)
2550
2551    >>> fcluster(Z, t=1.1, criterion='distance')
2552    array([1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 7, 8], dtype=int32)
2553
2554    >>> fcluster(Z, t=3, criterion='distance')
2555    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)
2556
2557    >>> fcluster(Z, t=9, criterion='distance')
2558    array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)
2559
2560    In the first case, the threshold ``t`` is too small to allow any two
2561    samples in the data to form a cluster, so 12 different clusters are
2562    returned.
2563
2564    In the second case, the threshold is large enough to allow the first
2565    4 points to be merged with their nearest neighbors. So, here, only 8
2566    clusters are returned.
2567
2568    The third case, with a much higher threshold, allows for up to 8 data
2569    points to be connected - so 4 clusters are returned here.
2570
2571    Lastly, the threshold of the fourth case is large enough to allow for
2572    all data points to be merged together - so a single cluster is returned.
2573
2574    """
2575    Z = np.asarray(Z, order='c')
2576    is_valid_linkage(Z, throw=True, name='Z')
2577
2578    n = Z.shape[0] + 1
2579    T = np.zeros((n,), dtype='i')
2580
2581    # Since the C code does not support striding using strides.
2582    # The dimensions are used instead.
2583    [Z] = _copy_arrays_if_base_present([Z])
2584
2585    if criterion == 'inconsistent':
2586        if R is None:
2587            R = inconsistent(Z, depth)
2588        else:
2589            R = np.asarray(R, order='c')
2590            is_valid_im(R, throw=True, name='R')
2591            # Since the C code does not support striding using strides.
2592            # The dimensions are used instead.
2593            [R] = _copy_arrays_if_base_present([R])
2594        _hierarchy.cluster_in(Z, R, T, float(t), int(n))
2595    elif criterion == 'distance':
2596        _hierarchy.cluster_dist(Z, T, float(t), int(n))
2597    elif criterion == 'maxclust':
2598        _hierarchy.cluster_maxclust_dist(Z, T, int(n), int(t))
2599    elif criterion == 'monocrit':
2600        [monocrit] = _copy_arrays_if_base_present([monocrit])
2601        _hierarchy.cluster_monocrit(Z, monocrit, T, float(t), int(n))
2602    elif criterion == 'maxclust_monocrit':
2603        [monocrit] = _copy_arrays_if_base_present([monocrit])
2604        _hierarchy.cluster_maxclust_monocrit(Z, monocrit, T, int(n), int(t))
2605    else:
2606        raise ValueError('Invalid cluster formation criterion: %s'
2607                         % str(criterion))
2608    return T
2609
2610
def fclusterdata(X, t, criterion='inconsistent',
                 metric='euclidean', depth=2, method='single', R=None):
    """
    Cluster observation data using a given metric.

    Clusters the original observations in the n-by-m data
    matrix X (n observations in m dimensions), using the euclidean
    distance metric to calculate distances between original observations,
    performs hierarchical clustering using the single linkage algorithm,
    and forms flat clusters using the inconsistency method with `t` as the
    cut-off threshold.

    A 1-D array ``T`` of length ``n`` is returned. ``T[i]`` is
    the index of the flat cluster to which the original observation ``i``
    belongs.

    Parameters
    ----------
    X : (N, M) ndarray
        N by M data matrix with N observations in M dimensions.
    t : scalar
        For criteria 'inconsistent', 'distance' or 'monocrit',
        this is the threshold to apply when forming flat clusters.
        For 'maxclust' or 'maxclust_monocrit' criteria,
        this would be max number of clusters requested.
    criterion : str, optional
        Specifies the criterion for forming flat clusters. Valid
        values are 'inconsistent' (default), 'distance', or 'maxclust'
        cluster formation algorithms. See `fcluster` for descriptions.
    metric : str or function, optional
        The distance metric for calculating pairwise distances. See
        ``distance.pdist`` for descriptions and linkage to verify
        compatibility with the linkage method.
    depth : int, optional
        The maximum depth for the inconsistency calculation. See
        `inconsistent` for more information.
    method : str, optional
        The linkage method to use (single, complete, average,
        weighted, median, centroid, ward). See `linkage` for more
        information. Default is "single".
    R : ndarray, optional
        The inconsistency matrix. It will be computed if necessary
        if it is not passed.

    Returns
    -------
    fclusterdata : ndarray
        A vector of length n. T[i] is the flat cluster number to
        which original observation i belongs.

    See Also
    --------
    scipy.spatial.distance.pdist : pairwise distance metrics

    Notes
    -----
    This function is similar to the MATLAB function ``clusterdata``.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import fclusterdata

    This is a convenience method that abstracts all the steps to perform in a
    typical SciPy's hierarchical clustering workflow.

    * Transform the input data into a condensed matrix with `scipy.spatial.distance.pdist`.

    * Apply a clustering method.

    * Obtain flat clusters at a user defined distance threshold ``t`` using `scipy.cluster.hierarchy.fcluster`.

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> fclusterdata(X, t=1)
    array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)

    The output here (for the dataset ``X``, distance threshold ``t``, and the
    default settings) is four clusters with three data points each.

    """
    X = np.asarray(X, order='c', dtype=np.double)

    # ``np.asarray`` always returns an ndarray, so only the dimensionality
    # needs validating here.
    if X.ndim != 2:
        raise TypeError('The observation matrix X must be an n by m numpy '
                        'array.')

    # Condensed pairwise distances -> linkage -> (optional) inconsistency
    # statistics -> flat clusters.
    Y = distance.pdist(X, metric=metric)
    Z = linkage(Y, method=method)
    if R is None:
        R = inconsistent(Z, d=depth)
    else:
        R = np.asarray(R, order='c')
    T = fcluster(Z, criterion=criterion, depth=depth, R=R, t=t)
    return T
2708
2709
def leaves_list(Z):
    """
    Return the ids of the leaf nodes in left-to-right order.

    Entry ``i`` of the returned array is the original observation index
    that appears at position ``i`` in a left-to-right traversal of the
    leaves of the tree encoded by the linkage matrix ``Z``.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a matrix.  `Z` is
        a linkage matrix.  See `linkage` for more information.

    Returns
    -------
    leaves_list : ndarray
        The list of leaf node ids.

    See Also
    --------
    dendrogram : for information about dendrogram structure.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, dendrogram, leaves_list
    >>> from scipy.spatial.distance import pdist
    >>> from matplotlib import pyplot as plt

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))

    The linkage matrix ``Z`` represents a dendrogram, that is, a tree that
    encodes the structure of the clustering performed.
    `scipy.cluster.hierarchy.leaves_list` shows the mapping between
    indices in the ``X`` dataset and leaves in the dendrogram:

    >>> leaves_list(Z)
    array([ 2,  0,  1,  5,  3,  4,  8,  6,  7, 11,  9, 10], dtype=int32)

    >>> fig = plt.figure(figsize=(25, 10))
    >>> dn = dendrogram(Z)
    >>> plt.show()

    """
    Z = np.asarray(Z, order='c')
    is_valid_linkage(Z, throw=True, name='Z')
    num_obs = Z.shape[0] + 1
    # Output buffer; the C routine fills it in-place with the leaf order.
    leaf_order = np.zeros((num_obs,), dtype='i')
    # The C code cannot handle strided arrays, so force a contiguous copy
    # if Z is a view on another array.
    [Z] = _copy_arrays_if_base_present([Z])
    _hierarchy.prelist(Z, leaf_order, int(num_obs))
    return leaf_order
2765
2766
2767# Maps number of leaves to text size.
2768#
2769# p <= 20, size="12"
2770# 20 < p <= 30, size="10"
2771# 30 < p <= 50, size="8"
2772# 50 < p <= np.inf, size="6"
2773
2774_dtextsizes = {20: 12, 30: 10, 50: 8, 85: 6, np.inf: 5}
2775_drotation = {20: 0, 40: 45, np.inf: 90}
2776_dtextsortedkeys = list(_dtextsizes.keys())
2777_dtextsortedkeys.sort()
2778_drotationsortedkeys = list(_drotation.keys())
2779_drotationsortedkeys.sort()
2780
2781
2782def _remove_dups(L):
2783    """
2784    Remove duplicates AND preserve the original order of the elements.
2785
2786    The set class is not guaranteed to do this.
2787    """
2788    seen_before = set([])
2789    L2 = []
2790    for i in L:
2791        if i not in seen_before:
2792            seen_before.add(i)
2793            L2.append(i)
2794    return L2
2795
2796
def _get_tick_text_size(p):
    """Return the leaf tick-label font size for a dendrogram with *p* leaves.

    Picks the size associated with the smallest threshold in
    ``_dtextsortedkeys`` that is >= p (None if no threshold matches).
    """
    sizes = (_dtextsizes[threshold] for threshold in _dtextsortedkeys
             if p <= threshold)
    return next(sizes, None)
2801
2802
def _get_tick_rotation(p):
    """Return the leaf tick-label rotation (degrees) for *p* leaves.

    Picks the rotation associated with the smallest threshold in
    ``_drotationsortedkeys`` that is >= p (None if no threshold matches).
    """
    rotations = (_drotation[threshold] for threshold in _drotationsortedkeys
                 if p <= threshold)
    return next(rotations, None)
2807
2808
def _plot_dendrogram(icoords, dcoords, ivl, p, n, mh, orientation,
                     no_labels, color_list, leaf_font_size=None,
                     leaf_rotation=None, contraction_marks=None,
                     ax=None, above_threshold_color='C0'):
    """Render the dendrogram's U-shaped links on a matplotlib axes.

    Parameters
    ----------
    icoords, dcoords : sequences of 4-element sequences
        Independent/dependent coordinates of each link, as computed by
        `dendrogram` (its ``'icoord'``/``'dcoord'`` result keys).
    ivl : list of str
        Leaf labels; also determines the plot width (10 units per leaf).
    p, n : int
        Accepted for interface compatibility with the caller.
        NOTE(review): neither appears to be used in this body — confirm.
    mh : float
        Maximum link height; the dependent axis extends to ``mh * 1.05``.
    orientation : {'top', 'bottom', 'left', 'right'}
        Which side of the plot the root is drawn on.
    no_labels : bool
        When True, leaf tick labels are suppressed.
    color_list : list of str
        Color of each link, parallel to ``icoords``/``dcoords``.
    leaf_font_size, leaf_rotation : optional
        Explicit label size/rotation; when None, values are derived from
        the leaf count via _get_tick_text_size/_get_tick_rotation.
    contraction_marks : sequence of (x, y) pairs, optional
        Positions at which small ellipses are drawn to mark contracted
        subtrees (used with truncation).
    ax : matplotlib Axes, optional
        Target axes. When None, pylab's current axes are used and an
        interactive redraw is triggered at the end.
    above_threshold_color : str, optional
        Color whose line collection is added last, so it is drawn on top.
    """
    # Import matplotlib here so that it's not imported unless dendrograms
    # are plotted. Raise an informative error if importing fails.
    try:
        # if an axis is provided, don't use pylab at all
        if ax is None:
            import matplotlib.pylab
        import matplotlib.patches
        import matplotlib.collections
    except ImportError as e:
        raise ImportError("You must install the matplotlib library to plot "
                          "the dendrogram. Use no_plot=True to calculate the "
                          "dendrogram without plotting.") from e

    if ax is None:
        ax = matplotlib.pylab.gca()
        # if we're using pylab, we want to trigger a draw at the end
        trigger_redraw = True
    else:
        trigger_redraw = False

    # Independent variable plot width
    ivw = len(ivl) * 10
    # Dependent variable plot height
    dvw = mh + mh * 0.05

    # One tick at the center of each leaf's 10-unit-wide slot.
    iv_ticks = np.arange(5, len(ivl) * 10 + 5, 10)
    if orientation in ('top', 'bottom'):
        # Vertical orientations: the independent axis is x. 'bottom'
        # inverts the y limits so the root is at the bottom.
        if orientation == 'top':
            ax.set_ylim([0, dvw])
            ax.set_xlim([0, ivw])
        else:
            ax.set_ylim([dvw, 0])
            ax.set_xlim([0, ivw])

        xlines = icoords
        ylines = dcoords
        if no_labels:
            ax.set_xticks([])
            ax.set_xticklabels([])
        else:
            ax.set_xticks(iv_ticks)

            if orientation == 'top':
                ax.xaxis.set_ticks_position('bottom')
            else:
                ax.xaxis.set_ticks_position('top')

            # Make the tick marks invisible because they cover up the links
            for line in ax.get_xticklines():
                line.set_visible(False)

            leaf_rot = (float(_get_tick_rotation(len(ivl)))
                        if (leaf_rotation is None) else leaf_rotation)
            leaf_font = (float(_get_tick_text_size(len(ivl)))
                         if (leaf_font_size is None) else leaf_font_size)
            ax.set_xticklabels(ivl, rotation=leaf_rot, size=leaf_font)

    elif orientation in ('left', 'right'):
        # Horizontal orientations: coordinate roles are swapped and the
        # independent axis is y. 'left' inverts the x limits so the root
        # is on the left.
        if orientation == 'left':
            ax.set_xlim([dvw, 0])
            ax.set_ylim([0, ivw])
        else:
            ax.set_xlim([0, dvw])
            ax.set_ylim([0, ivw])

        xlines = dcoords
        ylines = icoords
        if no_labels:
            ax.set_yticks([])
            ax.set_yticklabels([])
        else:
            ax.set_yticks(iv_ticks)

            if orientation == 'left':
                ax.yaxis.set_ticks_position('right')
            else:
                ax.yaxis.set_ticks_position('left')

            # Make the tick marks invisible because they cover up the links
            for line in ax.get_yticklines():
                line.set_visible(False)

            leaf_font = (float(_get_tick_text_size(len(ivl)))
                         if (leaf_font_size is None) else leaf_font_size)

            if leaf_rotation is not None:
                ax.set_yticklabels(ivl, rotation=leaf_rotation, size=leaf_font)
            else:
                ax.set_yticklabels(ivl, size=leaf_font)

    # Let's use collections instead. This way there is a separate legend item
    # for each tree grouping, rather than stupidly one for each line segment.
    colors_used = _remove_dups(color_list)
    color_to_lines = {}
    for color in colors_used:
        color_to_lines[color] = []
    for (xline, yline, color) in zip(xlines, ylines, color_list):
        color_to_lines[color].append(list(zip(xline, yline)))

    colors_to_collections = {}
    # Construct the collections.
    for color in colors_used:
        coll = matplotlib.collections.LineCollection(color_to_lines[color],
                                                     colors=(color,))
        colors_to_collections[color] = coll

    # Add all the groupings below the color threshold.
    for color in colors_used:
        if color != above_threshold_color:
            ax.add_collection(colors_to_collections[color])
    # If there's a grouping of links above the color threshold, it goes last.
    if above_threshold_color in colors_to_collections:
        ax.add_collection(colors_to_collections[above_threshold_color])

    # Draw small semi-transparent black ellipses marking contracted
    # subtrees; the ellipse's short axis follows the dependent axis.
    if contraction_marks is not None:
        Ellipse = matplotlib.patches.Ellipse
        for (x, y) in contraction_marks:
            if orientation in ('left', 'right'):
                e = Ellipse((y, x), width=dvw / 100, height=1.0)
            else:
                e = Ellipse((x, y), width=1.0, height=dvw / 100)
            ax.add_artist(e)
            e.set_clip_box(ax.bbox)
            e.set_alpha(0.5)
            e.set_facecolor('k')

    if trigger_redraw:
        matplotlib.pylab.draw_if_interactive()
2941
2942
# 'C0' is reserved for the above-threshold color (the default of the
# ``above_threshold_color`` parameter), so the link palette cycles
# through C1-C9.
_link_line_colors_default = ('C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9')
_link_line_colors = list(_link_line_colors_default)
2946
2947
def set_link_color_palette(palette):
    """
    Set list of matplotlib color codes for use by dendrogram.

    Note that this palette is global (i.e., setting it once changes the colors
    for all subsequent calls to `dendrogram`) and that it affects only the
    colors below ``color_threshold``.

    Note that `dendrogram` also accepts a custom coloring function through its
    ``link_color_func`` keyword, which is more flexible and non-global.

    Parameters
    ----------
    palette : list of str or None
        A list of matplotlib color codes.  The order of the color codes is the
        order in which the colors are cycled through when color thresholding in
        the dendrogram.

        If ``None``, resets the palette to its default (which are matplotlib
        default colors C1 to C9).

    Returns
    -------
    None

    See Also
    --------
    dendrogram

    Notes
    -----
    Ability to reset the palette with ``None`` added in SciPy 0.17.0.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.cluster import hierarchy
    >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
    ...                    400., 754., 564., 138., 219., 869., 669.])
    >>> Z = hierarchy.linkage(ytdist, 'single')
    >>> dn = hierarchy.dendrogram(Z, no_plot=True)
    >>> dn['color_list']
    ['C1', 'C0', 'C0', 'C0', 'C0']
    >>> hierarchy.set_link_color_palette(['c', 'm', 'y', 'k'])
    >>> dn = hierarchy.dendrogram(Z, no_plot=True, above_threshold_color='b')
    >>> dn['color_list']
    ['c', 'b', 'b', 'b', 'b']
    >>> dn = hierarchy.dendrogram(Z, no_plot=True, color_threshold=267,
    ...                           above_threshold_color='k')
    >>> dn['color_list']
    ['c', 'm', 'm', 'k', 'k']

    Now, reset the color palette to its default:

    >>> hierarchy.set_link_color_palette(None)

    """
    if palette is None:
        # reset to its default
        palette = _link_line_colors_default
    elif not isinstance(palette, (list, tuple)):
        # isinstance (rather than an exact type() check) also accepts
        # list/tuple subclasses, which behave identically here.
        raise TypeError("palette must be a list or tuple")

    # Validate elements lazily instead of building an intermediate list.
    if not all(isinstance(p, str) for p in palette):
        raise TypeError("all palette list elements must be color strings")

    global _link_line_colors
    _link_line_colors = palette
3016
3017
3018def dendrogram(Z, p=30, truncate_mode=None, color_threshold=None,
3019               get_leaves=True, orientation='top', labels=None,
3020               count_sort=False, distance_sort=False, show_leaf_counts=True,
3021               no_plot=False, no_labels=False, leaf_font_size=None,
3022               leaf_rotation=None, leaf_label_func=None,
3023               show_contracted=False, link_color_func=None, ax=None,
3024               above_threshold_color='C0'):
3025    """
3026    Plot the hierarchical clustering as a dendrogram.
3027
3028    The dendrogram illustrates how each cluster is
3029    composed by drawing a U-shaped link between a non-singleton
3030    cluster and its children. The top of the U-link indicates a
3031    cluster merge. The two legs of the U-link indicate which clusters
3032    were merged. The length of the two legs of the U-link represents
3033    the distance between the child clusters. It is also the
3034    cophenetic distance between original observations in the two
3035    children clusters.
3036
3037    Parameters
3038    ----------
3039    Z : ndarray
3040        The linkage matrix encoding the hierarchical clustering to
3041        render as a dendrogram. See the ``linkage`` function for more
3042        information on the format of ``Z``.
3043    p : int, optional
3044        The ``p`` parameter for ``truncate_mode``.
3045    truncate_mode : str, optional
3046        The dendrogram can be hard to read when the original
3047        observation matrix from which the linkage is derived is
3048        large. Truncation is used to condense the dendrogram. There
3049        are several modes:
3050
3051        ``None``
3052          No truncation is performed (default).
3053          Note: ``'none'`` is an alias for ``None`` that's kept for
3054          backward compatibility.
3055
3056        ``'lastp'``
3057          The last ``p`` non-singleton clusters formed in the linkage are the
3058          only non-leaf nodes in the linkage; they correspond to rows
3059          ``Z[n-p-2:end]`` in ``Z``. All other non-singleton clusters are
3060          contracted into leaf nodes.
3061
3062        ``'level'``
3063          No more than ``p`` levels of the dendrogram tree are displayed.
3064          A "level" includes all nodes with ``p`` merges from the final merge.
3065
3066          Note: ``'mtica'`` is an alias for ``'level'`` that's kept for
3067          backward compatibility.
3068
3069    color_threshold : double, optional
3070        For brevity, let :math:`t` be the ``color_threshold``.
3071        Colors all the descendent links below a cluster node
3072        :math:`k` the same color if :math:`k` is the first node below
3073        the cut threshold :math:`t`. All links connecting nodes with
3074        distances greater than or equal to the threshold are colored
        with the default matplotlib color ``'C0'``. If :math:`t` is less
3076        than or equal to zero, all nodes are colored ``'C0'``.
3077        If ``color_threshold`` is None or 'default',
3078        corresponding with MATLAB(TM) behavior, the threshold is set to
3079        ``0.7*max(Z[:,2])``.
3080
3081    get_leaves : bool, optional
3082        Includes a list ``R['leaves']=H`` in the result
3083        dictionary. For each :math:`i`, ``H[i] == j``, cluster node
3084        ``j`` appears in position ``i`` in the left-to-right traversal
3085        of the leaves, where :math:`j < 2n-1` and :math:`i < n`.
3086    orientation : str, optional
3087        The direction to plot the dendrogram, which can be any
3088        of the following strings:
3089
3090        ``'top'``
3091          Plots the root at the top, and plot descendent links going downwards.
3092          (default).
3093
3094        ``'bottom'``
3095          Plots the root at the bottom, and plot descendent links going
3096          upwards.
3097
3098        ``'left'``
3099          Plots the root at the left, and plot descendent links going right.
3100
3101        ``'right'``
3102          Plots the root at the right, and plot descendent links going left.
3103
3104    labels : ndarray, optional
3105        By default, ``labels`` is None so the index of the original observation
3106        is used to label the leaf nodes.  Otherwise, this is an :math:`n`-sized
3107        sequence, with ``n == Z.shape[0] + 1``. The ``labels[i]`` value is the
3108        text to put under the :math:`i` th leaf node only if it corresponds to
3109        an original observation and not a non-singleton cluster.
3110    count_sort : str or bool, optional
3111        For each node n, the order (visually, from left-to-right) n's
3112        two descendent links are plotted is determined by this
3113        parameter, which can be any of the following values:
3114
3115        ``False``
3116          Nothing is done.
3117
3118        ``'ascending'`` or ``True``
3119          The child with the minimum number of original objects in its cluster
3120          is plotted first.
3121
3122        ``'descending'``
3123          The child with the maximum number of original objects in its cluster
3124          is plotted first.
3125
3126        Note, ``distance_sort`` and ``count_sort`` cannot both be True.
3127    distance_sort : str or bool, optional
3128        For each node n, the order (visually, from left-to-right) n's
3129        two descendent links are plotted is determined by this
3130        parameter, which can be any of the following values:
3131
3132        ``False``
3133          Nothing is done.
3134
3135        ``'ascending'`` or ``True``
3136          The child with the minimum distance between its direct descendents is
3137          plotted first.
3138
3139        ``'descending'``
3140          The child with the maximum distance between its direct descendents is
3141          plotted first.
3142
3143        Note ``distance_sort`` and ``count_sort`` cannot both be True.
3144    show_leaf_counts : bool, optional
3145         When True, leaf nodes representing :math:`k>1` original
3146         observation are labeled with the number of observations they
3147         contain in parentheses.
3148    no_plot : bool, optional
3149        When True, the final rendering is not performed. This is
3150        useful if only the data structures computed for the rendering
3151        are needed or if matplotlib is not available.
3152    no_labels : bool, optional
3153        When True, no labels appear next to the leaf nodes in the
3154        rendering of the dendrogram.
3155    leaf_rotation : double, optional
3156        Specifies the angle (in degrees) to rotate the leaf
3157        labels. When unspecified, the rotation is based on the number of
3158        nodes in the dendrogram (default is 0).
3159    leaf_font_size : int, optional
3160        Specifies the font size (in points) of the leaf labels. When
3161        unspecified, the size based on the number of nodes in the
3162        dendrogram.
3163    leaf_label_func : lambda or function, optional
3164        When ``leaf_label_func`` is a callable function, for each
3165        leaf with cluster index :math:`k < 2n-1`. The function
3166        is expected to return a string with the label for the
3167        leaf.
3168
3169        Indices :math:`k < n` correspond to original observations
3170        while indices :math:`k \\geq n` correspond to non-singleton
3171        clusters.
3172
3173        For example, to label singletons with their node id and
3174        non-singletons with their id, count, and inconsistency
3175        coefficient, simply do::
3176
3177            # First define the leaf label function.
3178            def llf(id):
3179                if id < n:
3180                    return str(id)
3181                else:
3182                    return '[%d %d %1.2f]' % (id, count, R[n-id,3])
3183
3184            # The text for the leaf nodes is going to be big so force
3185            # a rotation of 90 degrees.
3186            dendrogram(Z, leaf_label_func=llf, leaf_rotation=90)
3187
3188            # leaf_label_func can also be used together with ``truncate_mode`` parameter,
3189            # in which case you will get your leaves labeled after truncation:
3190            dendrogram(Z, leaf_label_func=llf, leaf_rotation=90,
3191                       truncate_mode='level', p=2)
3192
3193    show_contracted : bool, optional
3194        When True the heights of non-singleton nodes contracted
3195        into a leaf node are plotted as crosses along the link
3196        connecting that leaf node.  This really is only useful when
3197        truncation is used (see ``truncate_mode`` parameter).
3198    link_color_func : callable, optional
3199        If given, `link_color_function` is called with each non-singleton id
3200        corresponding to each U-shaped link it will paint. The function is
3201        expected to return the color to paint the link, encoded as a matplotlib
3202        color string code. For example::
3203
3204            dendrogram(Z, link_color_func=lambda k: colors[k])
3205
3206        colors the direct links below each untruncated non-singleton node
3207        ``k`` using ``colors[k]``.
3208    ax : matplotlib Axes instance, optional
3209        If None and `no_plot` is not True, the dendrogram will be plotted
3210        on the current axes.  Otherwise if `no_plot` is not True the
3211        dendrogram will be plotted on the given ``Axes`` instance. This can be
3212        useful if the dendrogram is part of a more complex figure.
3213    above_threshold_color : str, optional
3214        This matplotlib color string sets the color of the links above the
3215        color_threshold. The default is ``'C0'``.
3216
3217    Returns
3218    -------
3219    R : dict
3220        A dictionary of data structures computed to render the
3221        dendrogram. Its has the following keys:
3222
3223        ``'color_list'``
3224          A list of color names. The k'th element represents the color of the
3225          k'th link.
3226
3227        ``'icoord'`` and ``'dcoord'``
3228          Each of them is a list of lists. Let ``icoord = [I1, I2, ..., Ip]``
3229          where ``Ik = [xk1, xk2, xk3, xk4]`` and ``dcoord = [D1, D2, ..., Dp]``
3230          where ``Dk = [yk1, yk2, yk3, yk4]``, then the k'th link painted is
3231          ``(xk1, yk1)`` - ``(xk2, yk2)`` - ``(xk3, yk3)`` - ``(xk4, yk4)``.
3232
3233        ``'ivl'``
3234          A list of labels corresponding to the leaf nodes.
3235
3236        ``'leaves'``
3237          For each i, ``H[i] == j``, cluster node ``j`` appears in position
3238          ``i`` in the left-to-right traversal of the leaves, where
3239          :math:`j < 2n-1` and :math:`i < n`. If ``j`` is less than ``n``, the
3240          ``i``-th leaf node corresponds to an original observation.
3241          Otherwise, it corresponds to a non-singleton cluster.
3242
3243        ``'leaves_color_list'``
3244          A list of color names. The k'th element represents the color of the
3245          k'th leaf.
3246
3247    See Also
3248    --------
3249    linkage, set_link_color_palette
3250
3251    Notes
3252    -----
3253    It is expected that the distances in ``Z[:,2]`` be monotonic, otherwise
3254    crossings appear in the dendrogram.
3255
3256    Examples
3257    --------
3258    >>> from scipy.cluster import hierarchy
3259    >>> import matplotlib.pyplot as plt
3260
3261    A very basic example:
3262
3263    >>> ytdist = np.array([662., 877., 255., 412., 996., 295., 468., 268.,
3264    ...                    400., 754., 564., 138., 219., 869., 669.])
3265    >>> Z = hierarchy.linkage(ytdist, 'single')
3266    >>> plt.figure()
3267    >>> dn = hierarchy.dendrogram(Z)
3268
3269    Now, plot in given axes, improve the color scheme and use both vertical and
3270    horizontal orientations:
3271
3272    >>> hierarchy.set_link_color_palette(['m', 'c', 'y', 'k'])
3273    >>> fig, axes = plt.subplots(1, 2, figsize=(8, 3))
3274    >>> dn1 = hierarchy.dendrogram(Z, ax=axes[0], above_threshold_color='y',
3275    ...                            orientation='top')
3276    >>> dn2 = hierarchy.dendrogram(Z, ax=axes[1],
3277    ...                            above_threshold_color='#bcbddc',
3278    ...                            orientation='right')
3279    >>> hierarchy.set_link_color_palette(None)  # reset to default after use
3280    >>> plt.show()
3281
3282    """
3283    # This feature was thought about but never implemented (still useful?):
3284    #
3285    #         ... = dendrogram(..., leaves_order=None)
3286    #
3287    #         Plots the leaves in the order specified by a vector of
3288    #         original observation indices. If the vector contains duplicates
3289    #         or results in a crossing, an exception will be thrown. Passing
3290    #         None orders leaf nodes based on the order they appear in the
3291    #         pre-order traversal.
3292    Z = np.asarray(Z, order='c')
3293
3294    if orientation not in ["top", "left", "bottom", "right"]:
3295        raise ValueError("orientation must be one of 'top', 'left', "
3296                         "'bottom', or 'right'")
3297
3298    if labels is not None and Z.shape[0] + 1 != len(labels):
3299        raise ValueError("Dimensions of Z and labels must be consistent.")
3300
3301    is_valid_linkage(Z, throw=True, name='Z')
3302    Zs = Z.shape
3303    n = Zs[0] + 1
3304    if type(p) in (int, float):
3305        p = int(p)
3306    else:
3307        raise TypeError('The second argument must be a number')
3308
3309    if truncate_mode not in ('lastp', 'mlab', 'mtica', 'level', 'none', None):
3310        # 'mlab' and 'mtica' are kept working for backwards compat.
3311        raise ValueError('Invalid truncation mode.')
3312
3313    if truncate_mode == 'lastp' or truncate_mode == 'mlab':
3314        if p > n or p == 0:
3315            p = n
3316
3317    if truncate_mode == 'mtica':
3318        # 'mtica' is an alias
3319        truncate_mode = 'level'
3320
3321    if truncate_mode == 'level':
3322        if p <= 0:
3323            p = np.inf
3324
3325    if get_leaves:
3326        lvs = []
3327    else:
3328        lvs = None
3329
3330    icoord_list = []
3331    dcoord_list = []
3332    color_list = []
3333    current_color = [0]
3334    currently_below_threshold = [False]
3335    ivl = []  # list of leaves
3336
3337    if color_threshold is None or (isinstance(color_threshold, str) and
3338                                   color_threshold == 'default'):
3339        color_threshold = max(Z[:, 2]) * 0.7
3340
3341    R = {'icoord': icoord_list, 'dcoord': dcoord_list, 'ivl': ivl,
3342         'leaves': lvs, 'color_list': color_list}
3343
3344    # Empty list will be filled in _dendrogram_calculate_info
3345    contraction_marks = [] if show_contracted else None
3346
3347    _dendrogram_calculate_info(
3348        Z=Z, p=p,
3349        truncate_mode=truncate_mode,
3350        color_threshold=color_threshold,
3351        get_leaves=get_leaves,
3352        orientation=orientation,
3353        labels=labels,
3354        count_sort=count_sort,
3355        distance_sort=distance_sort,
3356        show_leaf_counts=show_leaf_counts,
3357        i=2*n - 2,
3358        iv=0.0,
3359        ivl=ivl,
3360        n=n,
3361        icoord_list=icoord_list,
3362        dcoord_list=dcoord_list,
3363        lvs=lvs,
3364        current_color=current_color,
3365        color_list=color_list,
3366        currently_below_threshold=currently_below_threshold,
3367        leaf_label_func=leaf_label_func,
3368        contraction_marks=contraction_marks,
3369        link_color_func=link_color_func,
3370        above_threshold_color=above_threshold_color)
3371
3372    if not no_plot:
3373        mh = max(Z[:, 2])
3374        _plot_dendrogram(icoord_list, dcoord_list, ivl, p, n, mh, orientation,
3375                         no_labels, color_list,
3376                         leaf_font_size=leaf_font_size,
3377                         leaf_rotation=leaf_rotation,
3378                         contraction_marks=contraction_marks,
3379                         ax=ax,
3380                         above_threshold_color=above_threshold_color)
3381
3382    R["leaves_color_list"] = _get_leaves_color_list(R)
3383
3384    return R
3385
3386
3387def _get_leaves_color_list(R):
3388    leaves_color_list = [None] * len(R['leaves'])
3389    for link_x, link_y, link_color in zip(R['icoord'],
3390                                          R['dcoord'],
3391                                          R['color_list']):
3392        for (xi, yi) in zip(link_x, link_y):
3393            if yi == 0.0:  # if yi is 0.0, the point is a leaf
3394                # xi of leaves are      5, 15, 25, 35, ... (see `iv_ticks`)
3395                # index of leaves are   0,  1,  2,  3, ... as below
3396                leaf_index = (int(xi) - 5) // 10
3397                # each leaf has a same color of its link.
3398                leaves_color_list[leaf_index] = link_color
3399    return leaves_color_list
3400
3401
3402def _append_singleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
3403                                i, labels):
3404    # If the leaf id structure is not None and is a list then the caller
3405    # to dendrogram has indicated that cluster id's corresponding to the
3406    # leaf nodes should be recorded.
3407
3408    if lvs is not None:
3409        lvs.append(int(i))
3410
3411    # If leaf node labels are to be displayed...
3412    if ivl is not None:
3413        # If a leaf_label_func has been provided, the label comes from the
3414        # string returned from the leaf_label_func, which is a function
3415        # passed to dendrogram.
3416        if leaf_label_func:
3417            ivl.append(leaf_label_func(int(i)))
3418        else:
3419            # Otherwise, if the dendrogram caller has passed a labels list
3420            # for the leaf nodes, use it.
3421            if labels is not None:
3422                ivl.append(labels[int(i - n)])
3423            else:
3424                # Otherwise, use the id as the label for the leaf.x
3425                ivl.append(str(int(i)))
3426
3427
3428def _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl, leaf_label_func,
3429                                   i, labels, show_leaf_counts):
3430    # If the leaf id structure is not None and is a list then the caller
3431    # to dendrogram has indicated that cluster id's corresponding to the
3432    # leaf nodes should be recorded.
3433
3434    if lvs is not None:
3435        lvs.append(int(i))
3436    if ivl is not None:
3437        if leaf_label_func:
3438            ivl.append(leaf_label_func(int(i)))
3439        else:
3440            if show_leaf_counts:
3441                ivl.append("(" + str(int(Z[i - n, 3])) + ")")
3442            else:
3443                ivl.append("")
3444
3445
def _append_contraction_marks(Z, iv, i, n, contraction_marks):
    """Collect contraction marks for every non-singleton descendant of
    node ``i``, visiting its left child (column 0) then its right child
    (column 1)."""
    for col in (0, 1):
        _append_contraction_marks_sub(Z, iv, int(Z[i - n, col]), n,
                                      contraction_marks)
3449
3450
3451def _append_contraction_marks_sub(Z, iv, i, n, contraction_marks):
3452    if i >= n:
3453        contraction_marks.append((iv, Z[i - n, 2]))
3454        _append_contraction_marks_sub(Z, iv, int(Z[i - n, 0]), n, contraction_marks)
3455        _append_contraction_marks_sub(Z, iv, int(Z[i - n, 1]), n, contraction_marks)
3456
3457
def _dendrogram_calculate_info(Z, p, truncate_mode,
                               color_threshold=np.inf, get_leaves=True,
                               orientation='top', labels=None,
                               count_sort=False, distance_sort=False,
                               show_leaf_counts=False, i=-1, iv=0.0,
                               ivl=[], n=0, icoord_list=[], dcoord_list=[],
                               lvs=None, mhr=False,
                               current_color=[], color_list=[],
                               currently_below_threshold=[],
                               leaf_label_func=None, level=0,
                               contraction_marks=None,
                               link_color_func=None,
                               above_threshold_color='C0'):
    """
    Calculate the endpoints of the links as well as the labels for the
    dendrogram rooted at the node with index i. iv is the independent
    variable value to plot the left-most leaf node below the root node i
    (if orientation='top', this would be the left-most x value where the
    plotting of this root node i and its descendents should begin).

    ivl is a list to store the labels of the leaf nodes. The leaf_label_func
    is called whenever ivl != None, labels == None, and
    leaf_label_func != None. When ivl != None and labels != None, the
    labels list is used only for labeling the leaf nodes. When
    ivl == None, no labels are generated for leaf nodes.

    When get_leaves==True, a list of leaves is built as they are visited
    in the dendrogram.

    Returns a tuple with l being the independent variable coordinate that
    corresponds to the midpoint of cluster to the left of cluster i if
    i is non-singleton, otherwise the independent coordinate of the leaf
    node if i is a leaf node.

    NOTE(review): several parameters use mutable default arguments
    (``ivl``, ``icoord_list``, ``dcoord_list``, ``current_color``,
    ``color_list``, ``currently_below_threshold``).  ``dendrogram`` always
    passes these explicitly (including in the recursive calls below), so
    the shared defaults are never relied upon in practice, but they would
    persist across calls if a caller ever omitted them.

    Returns
    -------
    A tuple (left, w, h, md), where:
        * left is the independent variable coordinate of the center of
          the U of the subtree

        * w is the amount of space used for the subtree (in independent
          variable units)

        * h is the height of the subtree in dependent variable units

        * md is the ``max(Z[*,2]``) for all nodes ``*`` below and including
          the target node.

    """
    if n == 0:
        raise ValueError("Invalid singleton cluster count n.")

    if i == -1:
        raise ValueError("Invalid root cluster index i.")

    # Truncation: nodes that fall outside the truncation criterion are
    # drawn as leaves (possibly with contraction marks for the collapsed
    # subtree) instead of being expanded.
    if truncate_mode == 'lastp':
        # If the node is a leaf node but corresponds to a non-singleton
        # cluster, its label is either the empty string or the number of
        # original observations belonging to cluster i.
        if 2*n - p > i >= n:
            d = Z[i - n, 2]
            _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
                                           leaf_label_func, i, labels,
                                           show_leaf_counts)
            if contraction_marks is not None:
                _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks)
            return (iv + 5.0, 10.0, 0.0, d)
        elif i < n:
            _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
                                        leaf_label_func, i, labels)
            return (iv + 5.0, 10.0, 0.0, 0.0)
    elif truncate_mode == 'level':
        # Collapse every node deeper than p levels below the root.
        if i > n and level > p:
            d = Z[i - n, 2]
            _append_nonsingleton_leaf_node(Z, p, n, level, lvs, ivl,
                                           leaf_label_func, i, labels,
                                           show_leaf_counts)
            if contraction_marks is not None:
                _append_contraction_marks(Z, iv + 5.0, i, n, contraction_marks)
            return (iv + 5.0, 10.0, 0.0, d)
        elif i < n:
            _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
                                        leaf_label_func, i, labels)
            return (iv + 5.0, 10.0, 0.0, 0.0)
    elif truncate_mode in ('mlab',):
        msg = "Mode 'mlab' is deprecated in scipy 0.19.0 (it never worked)."
        warnings.warn(msg, DeprecationWarning)

    # Otherwise, only truncate if we have a leaf node.
    #
    # Only place leaves if they correspond to original observations.
    if i < n:
        _append_singleton_leaf_node(Z, p, n, level, lvs, ivl,
                                    leaf_label_func, i, labels)
        return (iv + 5.0, 10.0, 0.0, 0.0)

    # !!! Otherwise, we don't have a leaf node, so work on plotting a
    # non-leaf node.
    # Actual indices of a and b
    aa = int(Z[i - n, 0])
    ab = int(Z[i - n, 1])
    if aa >= n:
        # The number of singletons below cluster a
        na = Z[aa - n, 3]
        # The distance between a's two direct children.
        da = Z[aa - n, 2]
    else:
        na = 1
        da = 0.0
    if ab >= n:
        nb = Z[ab - n, 3]
        db = Z[ab - n, 2]
    else:
        nb = 1
        db = 0.0

    # Decide which child subtree (ua) is drawn first (to the left) and
    # which second (ub), based on the requested sort criterion.
    # count_sort takes precedence over distance_sort when both are set.
    if count_sort == 'ascending' or count_sort == True:
        # If a has a count greater than b, it and its descendents should
        # be drawn to the right. Otherwise, to the left.
        if na > nb:
            # The cluster index to draw to the left (ua) will be ab
            # and the one to draw to the right (ub) will be aa
            ua = ab
            ub = aa
        else:
            ua = aa
            ub = ab
    elif count_sort == 'descending':
        # If a has a count less than or equal to b, it and its
        # descendents should be drawn to the left. Otherwise, to
        # the right.
        if na > nb:
            ua = aa
            ub = ab
        else:
            ua = ab
            ub = aa
    elif distance_sort == 'ascending' or distance_sort == True:
        # If a has a distance greater than b, it and its descendents should
        # be drawn to the right. Otherwise, to the left.
        if da > db:
            ua = ab
            ub = aa
        else:
            ua = aa
            ub = ab
    elif distance_sort == 'descending':
        # If a has a distance less than or equal to b, it and its
        # descendents should be drawn to the left. Otherwise, to
        # the right.
        if da > db:
            ua = aa
            ub = ab
        else:
            ua = ab
            ub = aa
    else:
        ua = aa
        ub = ab

    # Updated iv variable and the amount of space used.
    # Recurse into the first (left) subtree: uiva is the center of its U,
    # uwa the width it consumed, uah the height of its root link, and
    # uamd the maximum merge distance below it.
    (uiva, uwa, uah, uamd) = \
        _dendrogram_calculate_info(
            Z=Z, p=p,
            truncate_mode=truncate_mode,
            color_threshold=color_threshold,
            get_leaves=get_leaves,
            orientation=orientation,
            labels=labels,
            count_sort=count_sort,
            distance_sort=distance_sort,
            show_leaf_counts=show_leaf_counts,
            i=ua, iv=iv, ivl=ivl, n=n,
            icoord_list=icoord_list,
            dcoord_list=dcoord_list, lvs=lvs,
            current_color=current_color,
            color_list=color_list,
            currently_below_threshold=currently_below_threshold,
            leaf_label_func=leaf_label_func,
            level=level + 1, contraction_marks=contraction_marks,
            link_color_func=link_color_func,
            above_threshold_color=above_threshold_color)

    # Choose this link's color.  Links at or above the threshold (or any
    # link when the threshold is non-positive) use above_threshold_color;
    # crossing from below back to above the threshold advances the
    # palette index so the next below-threshold cluster gets a new color.
    # current_color / currently_below_threshold are single-element lists
    # shared (mutated in place) across the whole recursion.
    h = Z[i - n, 2]
    if h >= color_threshold or color_threshold <= 0:
        c = above_threshold_color

        if currently_below_threshold[0]:
            current_color[0] = (current_color[0] + 1) % len(_link_line_colors)
        currently_below_threshold[0] = False
    else:
        currently_below_threshold[0] = True
        c = _link_line_colors[current_color[0]]

    # Recurse into the second (right) subtree, starting after the space
    # the first subtree used (iv + uwa).
    (uivb, uwb, ubh, ubmd) = \
        _dendrogram_calculate_info(
            Z=Z, p=p,
            truncate_mode=truncate_mode,
            color_threshold=color_threshold,
            get_leaves=get_leaves,
            orientation=orientation,
            labels=labels,
            count_sort=count_sort,
            distance_sort=distance_sort,
            show_leaf_counts=show_leaf_counts,
            i=ub, iv=iv + uwa, ivl=ivl, n=n,
            icoord_list=icoord_list,
            dcoord_list=dcoord_list, lvs=lvs,
            current_color=current_color,
            color_list=color_list,
            currently_below_threshold=currently_below_threshold,
            leaf_label_func=leaf_label_func,
            level=level + 1, contraction_marks=contraction_marks,
            link_color_func=link_color_func,
            above_threshold_color=above_threshold_color)

    # Maximum merge distance at or below this node.
    max_dist = max(uamd, ubmd, h)

    # The four (x, y) points of this link's U shape.
    icoord_list.append([uiva, uiva, uivb, uivb])
    dcoord_list.append([uah, h, h, ubh])
    if link_color_func is not None:
        v = link_color_func(int(i))
        if not isinstance(v, str):
            raise TypeError("link_color_func must return a matplotlib "
                            "color string!")
        color_list.append(v)
    else:
        color_list.append(c)

    return (((uiva + uivb) / 2), uwa + uwb, h, max_dist)
3688
3689
def is_isomorphic(T1, T2):
    """
    Determine if two different cluster assignments are equivalent.

    Parameters
    ----------
    T1 : array_like
        An assignment of singleton cluster ids to flat cluster ids.
    T2 : array_like
        An assignment of singleton cluster ids to flat cluster ids.

    Returns
    -------
    b : bool
        Whether the flat cluster assignments `T1` and `T2` are
        equivalent.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    fcluster : for the creation of flat cluster assignments.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import fcluster, is_isomorphic
    >>> from scipy.cluster.hierarchy import single, complete
    >>> from scipy.spatial.distance import pdist

    Two flat cluster assignments can be isomorphic if they represent the same
    cluster assignment, with different labels.

    For example, we can use the `scipy.cluster.hierarchy.single`: method
    and flatten the output to four clusters:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = single(pdist(X))
    >>> T = fcluster(Z, 1, criterion='distance')
    >>> T
    array([3, 3, 3, 4, 4, 4, 2, 2, 2, 1, 1, 1], dtype=int32)

    We can then do the same using the
    `scipy.cluster.hierarchy.complete`: method:

    >>> Z = complete(pdist(X))
    >>> T_ = fcluster(Z, 1.5, criterion='distance')
    >>> T_
    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)

    As we can see, in both cases we obtain four clusters and all the data
    points are distributed in the same way - the only thing that changes
    are the flat cluster labels (3 => 1, 4 => 2, 2 => 3 and 1 => 4), so both
    cluster assignments are isomorphic:

    >>> is_isomorphic(T, T_)
    True

    """
    T1 = np.asarray(T1, order='c')
    T2 = np.asarray(T2, order='c')

    # np.asarray always yields an ndarray, so the former explicit
    # ``type(...) != np.ndarray`` checks could never fire for valid input
    # and have been dropped in favor of shape validation alone.
    if T1.ndim != 1:
        raise ValueError('T1 must be one-dimensional.')
    if T2.ndim != 1:
        raise ValueError('T2 must be one-dimensional.')
    if T1.shape[0] != T2.shape[0]:
        raise ValueError('T1 and T2 must have the same number of elements.')

    # The assignments are isomorphic iff the label correspondence is a
    # consistent bijection; track it in both directions.
    d1 = {}
    d2 = {}
    for a, b in zip(T1, T2):
        if a in d1:
            # a was seen before: b must already be mapped, and both
            # directions must agree with the earlier pairing.
            if b not in d2:
                return False
            if d1[a] != b or d2[b] != a:
                return False
        elif b in d2:
            # b was seen before but paired with a different label.
            return False
        else:
            d1[a] = b
            d2[b] = a
    return True
3783
3784
def maxdists(Z):
    """
    Return the maximum distance between any non-singleton cluster.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a matrix. See
        ``linkage`` for more information.

    Returns
    -------
    maxdists : ndarray
        A ``(n-1)`` sized numpy array of doubles; ``MD[i]`` represents
        the maximum distance between any cluster (including
        singletons) below and including the node with index i. More
        specifically, ``MD[i] = Z[Q(i)-n, 2].max()`` where ``Q(i)`` is the
        set of all node indices below and including node i.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    is_monotonic : for testing for monotonicity of a linkage matrix.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import median, maxdists
    >>> from scipy.spatial.distance import pdist

    Given a linkage matrix ``Z``, `scipy.cluster.hierarchy.maxdists`
    computes for each new cluster generated (i.e., for each row of the linkage
    matrix) what is the maximum distance between any two child clusters.

    Due to the nature of hierarchical clustering, in many cases this is going
    to be just the distance between the two child clusters that were merged
    to form the current one - that is, Z[:,2].

    However, for non-monotonic cluster assignments such as
    `scipy.cluster.hierarchy.median` clustering this is not always the
    case: There may be cluster formations where the distance between the two
    clusters merged is smaller than the distance between their children.

    We can see this in an example:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = median(pdist(X))
    >>> Z
    array([[ 0.        ,  1.        ,  1.        ,  2.        ],
           [ 3.        ,  4.        ,  1.        ,  2.        ],
           [ 9.        , 10.        ,  1.        ,  2.        ],
           [ 6.        ,  7.        ,  1.        ,  2.        ],
           [ 2.        , 12.        ,  1.11803399,  3.        ],
           [ 5.        , 13.        ,  1.11803399,  3.        ],
           [ 8.        , 15.        ,  1.11803399,  3.        ],
           [11.        , 14.        ,  1.11803399,  3.        ],
           [18.        , 19.        ,  3.        ,  6.        ],
           [16.        , 17.        ,  3.5       ,  6.        ],
           [20.        , 21.        ,  3.25      , 12.        ]])
    >>> maxdists(Z)
    array([1.        , 1.        , 1.        , 1.        , 1.11803399,
           1.11803399, 1.11803399, 1.11803399, 3.        , 3.5       ,
           3.5       ])

    Note that while the distance between the two clusters merged when creating the
    last cluster is 3.25, there are two children (clusters 16 and 17) whose distance
    is larger (3.5). Thus, `scipy.cluster.hierarchy.maxdists` returns 3.5 in
    this case.

    """
    # Normalize to a contiguous double array and validate it as a
    # linkage matrix before handing it to the C routine.
    Z = np.asarray(Z, order='c', dtype=np.double)
    is_valid_linkage(Z, throw=True, name='Z')

    num_obs = Z.shape[0] + 1
    max_dists = np.zeros((num_obs - 1,))
    # Copy if Z is a view so the C code operates on arrays owning
    # their data.
    [Z] = _copy_arrays_if_base_present([Z])
    _hierarchy.get_max_dist_for_each_cluster(Z, max_dists, int(num_obs))
    return max_dists
3866
3867
def maxinconsts(Z, R):
    """
    Return the maximum inconsistency coefficient for each
    non-singleton cluster and its children.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a matrix. See
        `linkage` for more information.
    R : ndarray
        The inconsistency matrix.

    Returns
    -------
    MI : ndarray
        A monotonic ``(n-1)``-sized numpy array of doubles.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    inconsistent : for the creation of an inconsistency matrix.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import median, inconsistent, maxinconsts
    >>> from scipy.spatial.distance import pdist

    Given a data set ``X``, we can apply a clustering method to obtain a
    linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
    be also used to obtain the inconsistency matrix ``R`` associated to
    this clustering process:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = median(pdist(X))
    >>> R = inconsistent(Z)
    >>> Z
    array([[ 0.        ,  1.        ,  1.        ,  2.        ],
           [ 3.        ,  4.        ,  1.        ,  2.        ],
           [ 9.        , 10.        ,  1.        ,  2.        ],
           [ 6.        ,  7.        ,  1.        ,  2.        ],
           [ 2.        , 12.        ,  1.11803399,  3.        ],
           [ 5.        , 13.        ,  1.11803399,  3.        ],
           [ 8.        , 15.        ,  1.11803399,  3.        ],
           [11.        , 14.        ,  1.11803399,  3.        ],
           [18.        , 19.        ,  3.        ,  6.        ],
           [16.        , 17.        ,  3.5       ,  6.        ],
           [20.        , 21.        ,  3.25      , 12.        ]])
    >>> R
    array([[1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.74535599, 1.08655358, 3.        , 1.15470054],
           [1.91202266, 1.37522872, 3.        , 1.15470054],
           [3.25      , 0.25      , 3.        , 0.        ]])

    Here, `scipy.cluster.hierarchy.maxinconsts` can be used to compute
    the maximum value of the inconsistency statistic (the last column of
    ``R``) for each non-singleton cluster and its children:

    >>> maxinconsts(Z, R)
    array([0.        , 0.        , 0.        , 0.        , 0.70710678,
           0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
           1.15470054])

    """
    # Normalize and validate both matrices before calling into C.
    Z = np.asarray(Z, order='c')
    R = np.asarray(R, order='c')
    is_valid_linkage(Z, throw=True, name='Z')
    is_valid_im(R, throw=True, name='R')

    num_obs = Z.shape[0] + 1
    if Z.shape[0] != R.shape[0]:
        raise ValueError("The inconsistency matrix and linkage matrix each "
                         "have a different number of rows.")
    max_inconsists = np.zeros((num_obs - 1,))
    # Copy if either array is a view so the C code operates on arrays
    # owning their data.  Column 3 of R holds the inconsistency
    # coefficient.
    [Z, R] = _copy_arrays_if_base_present([Z, R])
    _hierarchy.get_max_Rfield_for_each_cluster(Z, R, max_inconsists,
                                               int(num_obs), 3)
    return max_inconsists
3956
3957
def maxRstat(Z, R, i):
    """
    Return the maximum statistic for each non-singleton cluster and its
    children.

    Parameters
    ----------
    Z : array_like
        The hierarchical clustering encoded as a matrix. See `linkage` for more
        information.
    R : array_like
        The inconsistency matrix.
    i : int
        The column of `R` to use as the statistic.

    Returns
    -------
    MR : ndarray
        Calculates the maximum statistic for the i'th column of the
        inconsistency matrix `R` for each non-singleton cluster
        node. ``MR[j]`` is the maximum over ``R[Q(j)-n, i]``, where
        ``Q(j)`` the set of all node ids corresponding to nodes below
        and including ``j``.

    See Also
    --------
    linkage : for a description of what a linkage matrix is.
    inconsistent : for the creation of an inconsistency matrix.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import median, inconsistent, maxRstat
    >>> from scipy.spatial.distance import pdist

    Given a data set ``X``, we can apply a clustering method to obtain a
    linkage matrix ``Z``. `scipy.cluster.hierarchy.inconsistent` can
    be also used to obtain the inconsistency matrix ``R`` associated to
    this clustering process:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = median(pdist(X))
    >>> R = inconsistent(Z)
    >>> R
    array([[1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.        , 0.        , 1.        , 0.        ],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.05901699, 0.08346263, 2.        , 0.70710678],
           [1.74535599, 1.08655358, 3.        , 1.15470054],
           [1.91202266, 1.37522872, 3.        , 1.15470054],
           [3.25      , 0.25      , 3.        , 0.        ]])

    `scipy.cluster.hierarchy.maxRstat` can be used to compute
    the maximum value of each column of ``R``, for each non-singleton
    cluster and its children:

    >>> maxRstat(Z, R, 0)
    array([1.        , 1.        , 1.        , 1.        , 1.05901699,
           1.05901699, 1.05901699, 1.05901699, 1.74535599, 1.91202266,
           3.25      ])
    >>> maxRstat(Z, R, 1)
    array([0.        , 0.        , 0.        , 0.        , 0.08346263,
           0.08346263, 0.08346263, 0.08346263, 1.08655358, 1.37522872,
           1.37522872])
    >>> maxRstat(Z, R, 3)
    array([0.        , 0.        , 0.        , 0.        , 0.70710678,
           0.70710678, 0.70710678, 0.70710678, 1.15470054, 1.15470054,
           1.15470054])

    """
    Z = np.asarray(Z, order='c')
    R = np.asarray(R, order='c')
    is_valid_linkage(Z, throw=True, name='Z')
    is_valid_im(R, throw=True, name='R')
    # Accept NumPy integer scalars as well as plain ints; the previous
    # ``type(i) is not int`` check rejected values obtained by indexing
    # NumPy arrays.
    if not isinstance(i, (int, np.integer)):
        raise TypeError('The third argument must be an integer.')
    if i < 0 or i > 3:
        raise ValueError('i must be an integer between 0 and 3 inclusive.')

    if Z.shape[0] != R.shape[0]:
        raise ValueError("The inconsistency matrix and linkage matrix each "
                         "have a different number of rows.")

    n = Z.shape[0] + 1
    MR = np.zeros((n - 1,))
    # Copy if either array is a view so the C code operates on arrays
    # owning their data.
    [Z, R] = _copy_arrays_if_base_present([Z, R])
    _hierarchy.get_max_Rfield_for_each_cluster(Z, R, MR, int(n), i)
    return MR
4053
4054
def leaders(Z, T):
    """
    Return the root nodes in a hierarchical clustering.

    Returns the root nodes in a hierarchical clustering corresponding
    to a cut defined by a flat cluster assignment vector ``T``. See
    the ``fcluster`` function for more information on the format of ``T``.

    For each flat cluster :math:`j` of the :math:`k` flat clusters
    represented in the n-sized flat cluster assignment vector ``T``,
    this function finds the lowest cluster node :math:`i` in the linkage
    tree Z, such that:

      * leaf descendants belong only to flat cluster j
        (i.e., ``T[p]==j`` for all :math:`p` in :math:`S(i)`, where
        :math:`S(i)` is the set of leaf ids of descendant leaf nodes
        with cluster node :math:`i`)

      * there does not exist a leaf that is not a descendant with
        :math:`i` that also belongs to cluster :math:`j`
        (i.e., ``T[q]!=j`` for all :math:`q` not in :math:`S(i)`). If
        this condition is violated, ``T`` is not a valid cluster
        assignment vector, and an exception will be thrown.

    Parameters
    ----------
    Z : ndarray
        The hierarchical clustering encoded as a matrix. See
        `linkage` for more information.
    T : ndarray
        The flat cluster assignment vector.

    Returns
    -------
    L : ndarray
        The leader linkage node id's stored as a k-element 1-D array,
        where ``k`` is the number of flat clusters found in ``T``.

        ``L[j]=i`` is the linkage cluster node id that is the
        leader of flat cluster with id M[j]. If ``i < n``, ``i``
        corresponds to an original observation, otherwise it
        corresponds to a non-singleton cluster.
    M : ndarray
        The leader linkage node id's stored as a k-element 1-D array, where
        ``k`` is the number of flat clusters found in ``T``. This allows the
        set of flat cluster ids to be any arbitrary set of ``k`` integers.

        For example: if ``L[3]=2`` and ``M[3]=8``, the flat cluster with
        id 8's leader is linkage node 2.

    See Also
    --------
    fcluster : for the creation of flat cluster assignments.

    Examples
    --------
    >>> from scipy.cluster.hierarchy import ward, fcluster, leaders
    >>> from scipy.spatial.distance import pdist

    Given a linkage matrix ``Z`` - obtained after applying a clustering
    method to a dataset ``X`` - and a flat cluster assignment array ``T``:

    >>> X = [[0, 0], [0, 1], [1, 0],
    ...      [0, 4], [0, 3], [1, 4],
    ...      [4, 0], [3, 0], [4, 1],
    ...      [4, 4], [3, 4], [4, 3]]

    >>> Z = ward(pdist(X))
    >>> Z
    array([[ 0.        ,  1.        ,  1.        ,  2.        ],
           [ 3.        ,  4.        ,  1.        ,  2.        ],
           [ 6.        ,  7.        ,  1.        ,  2.        ],
           [ 9.        , 10.        ,  1.        ,  2.        ],
           [ 2.        , 12.        ,  1.29099445,  3.        ],
           [ 5.        , 13.        ,  1.29099445,  3.        ],
           [ 8.        , 14.        ,  1.29099445,  3.        ],
           [11.        , 15.        ,  1.29099445,  3.        ],
           [16.        , 17.        ,  5.77350269,  6.        ],
           [18.        , 19.        ,  5.77350269,  6.        ],
           [20.        , 21.        ,  8.16496581, 12.        ]])

    >>> T = fcluster(Z, 3, criterion='distance')
    >>> T
    array([1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4], dtype=int32)

    `scipy.cluster.hierarchy.leaders` returns the indices of the nodes
    in the dendrogram that are the leaders of each flat cluster:

    >>> L, M = leaders(Z, T)
    >>> L
    array([16, 17, 18, 19], dtype=int32)

    (remember that indices 0-11 point to the 12 data points in ``X``,
    whereas indices 12-22 point to the 11 rows of ``Z``)

    `scipy.cluster.hierarchy.leaders` also returns the indices of
    the flat clusters in ``T``:

    >>> M
    array([1, 2, 3, 4], dtype=int32)

    """
    Z = np.asarray(Z, order='c')
    T = np.asarray(T, order='c')
    # Validate that T really is a 1-D integer vector, as the error message
    # promises.  (The previous ``type(T) != np.ndarray`` test was dead code:
    # ``np.asarray`` above always returns an ndarray.)
    if T.ndim != 1 or T.dtype != 'i':
        raise TypeError('T must be a one-dimensional numpy array of integers.')
    is_valid_linkage(Z, throw=True, name='Z')
    # A linkage over n observations has n - 1 rows, so T must have n entries.
    if len(T) != Z.shape[0] + 1:
        raise ValueError('Mismatch: len(T)!=Z.shape[0] + 1.')

    # One output slot per distinct flat cluster id found in T.
    Cl = np.unique(T)
    kk = len(Cl)
    L = np.zeros((kk,), dtype='i')
    M = np.zeros((kk,), dtype='i')
    n = Z.shape[0] + 1
    # The C routine writes into L and M in place; make sure we own the
    # underlying buffers before handing them over.
    [Z, T] = _copy_arrays_if_base_present([Z, T])
    s = _hierarchy.leaders(Z, T, L, M, int(kk), int(n))
    # A non-negative return value identifies the linkage node at which the
    # assignment vector was found to be inconsistent.
    if s >= 0:
        raise ValueError(('T is not a valid assignment vector. Error found '
                          'when examining linkage node %d (< 2n-1).') % s)
    return (L, M)
4176