1#!/usr/local/bin/python3.8
2# -*- coding: utf-8 -*-
3
4
5import random
6from itertools import chain, islice
7import bisect
8import math
9
10from mpmath import fsum
11from mathics.core.util import robust_min
12
13# publications used for this file:
14
# [Kurita1991] Takio Kurita: "An Efficient Agglomerative Clustering Algorithm Using A Heap", Pattern Recognition,
16# Volume 24 Issue 3, 1991.
17# [Ng1994] Raymond T. Ng, Jiawei Han: Efficient and Effective Clustering Methods for Spatial Data Mining.
18# VLDB 1994: 144-155
19# [Rangel2016] E. Rangel, W. Hendrix, A. Agrawal, W.-keng Liao, and A. Choudhary, “AGORAS: A Fast Algorithm for
20# Estimating Medoids in Large Datasets,” in Proceedings of International Conference on Computational Science (ICCS),
21# 2016.
22# [Hamerly2003] Greg Hamerly, Charles Elkan, Learning the k in k-means. In proceedings of the seventeenth
23# annual conference on neural information processing systems (NIPS), pages 281-288, December 2003.
24# [Desgraupes2013] Bernard Desgraupes, Package clusterCrit for R: Clustering Indices, available online at
25# https://cran.r-project.org/web/packages/clusterCrit/vignettes/clusterCrit.pdf
26# [Greutert2003] Andreas Greutert, "Methoden zur Schätzung der Clusteranzahl", Ausgabe: 29. Oktober 2003.
27# Diplomarbeit. ETH Zürich.
28# [Frey2007] Brendan J. Frey, Delbert Dueck, "Clustering by Passing Messages Between Data Points",
29# Science 16 Feb 2007: Vol. 315, Issue 5814, pp. 972-976
30# [Arthur2007] Arthur, D.; Vassilvitskii, S. (2007). "k-means++: the advantages of careful seeding",
31# Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. Society for Industrial and
32# Applied Mathematics Philadelphia, PA, USA. pp. 1027–1035.
33# [Hamerly2010] Greg Hamerly, Making k-means even faster In proceedings of the 2010 SIAM international
34# conference on data mining (SDM 2010), April 2010.
35
36# for agglomerative clustering, we use [Kurita1991].
37
38# for hierarchical clustering, we use ClaraNS (see [Ng1994]), but we start local searches not with a random sample,
39# but with a seed computed via AGORAS (see [Rangel2016]). for clustering without knowing the number of clusters k,
40# we use the approach described in [Hamerly2003]. further improvements might be gained through [Frey2007].
41
42
43def _index(i, j):  # i > j, returns j + sum(1, 2, ..., i - 1)
44    # x x x
45    # a x x
46    # b c x
47
48    # a = 0, b = 1, c = 2
49    return j + ((i - 1) * i) // 2
50
51
52def _ordered2(a, cmp):
53    iterator = iter(a)
54
55    min_i1 = 0
56    min_i2 = 1
57
58    min_x1 = next(iterator)
59    min_x2 = next(iterator)
60
61    if not cmp(min_x1, min_x2):
62        min_i1, min_i2 = min_i2, min_i1
63        min_x1, min_x2 = min_x2, min_x1
64
65    try:
66        i = 2
67        while True:
68            x = next(iterator)
69            if cmp(x, min_x1):
70                min_x2 = min_x1
71                min_i2 = min_i1
72                min_x1 = x
73                min_i1 = i
74            elif cmp(x, min_x2):
75                min_x2 = x
76                min_i2 = i
77            i += 1
78    except StopIteration:
79        return min_i1, min_x1, min_i2, min_x2
80
81
def _smallest2(a):
    """Return (i1, x1, i2, x2): the smallest and second-smallest
    elements of ``a`` together with their indices."""
    return _ordered2(a, lambda u, v: u < v)
84
85
def _largest2(a):
    """Return (i1, x1, i2, x2): the largest and second-largest
    elements of ``a`` together with their indices."""
    return _ordered2(a, lambda u, v: u > v)
88
89
90def _components(clusters, n):
91    components = [0] * n
92    for i, c in enumerate(clusters):
93        component_index = i + 1
94        for j in c:
95            components[j] = component_index
96    return components
97
98
99def _clusters(x, a, k):
100    clusters = [[] for _ in range(k)]
101    add = [c.append for c in clusters]
102    for i, j in enumerate(a):
103        add[j](x[i])
104    return clusters
105
106
107def _ratio_bigger_than(a, b):
108    # check if (a1 / a2) > (b1 / b2)
109    a1, a2 = a
110    b1, b2 = b
111    return a1 * b2 > b1 * a2
112
113
114def _pairwise_sum(a, b):
115    return [x + y for x, y in zip(a, b)]
116
117
class InfiniteSilhouette(Exception):
    """Raised when two clusters have distance 0, which makes the
    silhouette coefficient undefined (division by zero)."""
    pass
121
122
123def _silhouette(a, b):
124    if a is None:
125        # a is infinite, i.e. only one element
126        # in the cluster and thus no distances?
127        return 0.
128
129    # for the formula, see [Desgraupes2013].
130    try:
131        s = (b - a) / max(a, b)
132    except ZeroDivisionError:  # max(a, b) == 0?
133        raise InfiniteSilhouette()
134
135    return s
136
137
class PrecomputedDistances(object):
    """Distance provider backed by a fully precomputed triangular
    distance matrix (lower triangle, linearized via _index)."""

    def __init__(self, distances):
        self._distances = distances

    def distance(self, i, j):
        # only the lower triangle is stored, so order the pair first.
        lo, hi = min(i, j), max(i, j)
        return self._distances[_index(hi, lo)]

    def matrix(self):
        """Return the raw triangular distance matrix."""
        return self._distances
147
148
class LazyDistances(object):
    """Distance provider that computes distances on demand and caches
    them, keyed by triangular index; subclasses supply
    _compute_distance()."""

    def __init__(self):
        self._computed = {}

    def distance(self, i, j):
        cache = self._computed
        key = _index(max(i, j), min(i, j))
        d = cache.get(key, None)
        if d is None:
            # we are probably in the middle of a (seeded) random
            # computation, and we do not want the distance function
            # to mess with that state.
            saved_state = random.getstate()
            try:
                d = self._compute_distance(i, j)
            finally:
                random.setstate(saved_state)
            cache[key] = d
        return d

    def matrix(self):
        raise ValueError('LazyDistances does not support matrix()')

    def _compute_distance(self, i, j):
        raise ValueError('_compute_distance was not implemented')
174
175
176def _shuffled_range(n):
177    # returns all numbers from [0, ..., n - 1] in random order, never returning any number twice.
178
179    # basically a Fisher-Yates shuffle that does not materialize the data, which means it's especially suited for
180    # k << n, where k is the count of random numbers actually retrieved.
181
182    # performs surprisingly well even if all random numbers are retrieved. getting all available numbers from
183    # shuffled_range(10^7) only takes only 70% longer than generating just the raw 10 million random numbers.
184
185    a = dict()
186
187    def nth(k):
188        return a.get(k, k)
189
190    for i in range(n - 1):
191        ai = nth(i)
192
193        j = random.randint(i, n - 1)
194        aj = nth(j)
195
196        a[j] = ai
197        yield aj
198
199    yield nth(n - 1)
200
201
def _shuffled_tuples(*t):
    """Like _shuffled_range, but yielding tuples (as lists) of integers
    within the given ranges, in random order and without repetition.

    We map each tuple to a linear index and shuffle those indices.
    E.g. for t = (2, 3), we map:
    0 -> (0, 0); 1 -> (1, 0); 2 -> (0, 1)
    3 -> (1, 1); 4 -> (0, 2); 5 -> (1, 2)
    """
    n = 1
    for x in t:
        n *= x

    # iterate the underlying generator directly instead of calling
    # next() in a "while True" loop: under PEP 479 (python >= 3.7) a
    # StopIteration escaping a generator body is converted into a
    # RuntimeError, which would break callers (e.g. _Medoids.swap)
    # that rely on exhaustion being signalled as StopIteration.
    for index in _shuffled_range(n):
        r = []
        for x in t:
            r.append(index % x)
            index //= x
        yield r
224
225
226def _unmapped(u, v, distance):
227    # maps each object in u to the most similar object in v
228    # and returns a version of v from which all unmapped objects
229    # have been removed.
230
231    mapped = [False] * len(v)
232    for uu in u:
233        min_d = None
234        min_i = None
235        for i, vv in enumerate(v):
236            if uu == vv:
237                d = 0
238            else:
239                d = distance(uu, vv)
240            if min_d is None or d < min_d:
241                min_d = d
242                min_i = i
243        mapped[min_i] = True
244    return [vv for m, vv in zip(mapped, v) if m]
245
246
def _agoras(n, k, distance):  # see [Rangel2016]
    """Pick k medoid seed indices out of n points using the AGORAS
    sampling scheme; returns a list of k point indices."""
    gamma = 0.577215664901532  # Euler-Mascheroni constant
    m = int(math.ceil(k * (math.log(k) + gamma)))  # number of sample sets
    r = k * math.log(k) + gamma * k  # current sample size per set
    p = tuple(range(n))

    if r > n:
        # if k (and thus r) is very large in regard to n, we fall
        # back to simple sampling, since AGORAS is slow then.
        return random.sample(p, k)

    while True:
        s = [random.sample(p, min(int(r), n)) for _ in range(m)]
        for i in range(m - 1):
            # map each sample set onto the next, dropping targets no
            # point maps to; the survivors approximate the medoids.
            s[i + 1] = _unmapped(s[i], s[i + 1], distance)
            if len(s[i + 1]) < k:
                # too few survivors: enlarge the sample size and retry.
                r += r * (m - (i + 1)) / m
                break
        if len(s[-1]) > k:
            # too many survivors: shrink the sample size and retry.
            r *= 0.95
        elif len(s[-1]) == k:
            return s[-1]
269
270
class SplitCriterion(object):
    """Decides whether a binary split of a cluster should be kept."""
    # note that as this runs as part of an O(n) algorithm, this
    # should not introduce an O(n^2) algorithm. see [Desgraupes2013]
    # and [Greutert2003] for possible algorithms.

    def should_split(self, siblings, merged, unmerged, distance):
        """Return (split, next_criterion): whether ``merged`` should stay
        split into the clusters in ``unmerged``, and the criterion to use
        further down the recursion if so."""
        raise NotImplementedError()
278
279
class ApproximateSilhouetteSplitCriterion(SplitCriterion):
    # a fast, approximate version of the Silhouette index (see e.g.
    # [Desgraupes2013]), that operates on medoids instead of points,
    # thereby reducing its runtime to O(nk) as opposed to O(n^2).

    def __init__(self, last_criterion=None):
        # last_criterion: the silhouette index of the configuration
        # before the split; a new split must beat it to be accepted.
        self._last_criterion = last_criterion

    def should_split(self, siblings, merged, unmerged, distance):
        """Accept the split if the approximate global silhouette index of
        siblings + unmerged improves over the previous configuration;
        returns (accepted, criterion-for-further-splits)."""
        try:
            criterion = self._approximate_global_silhouette_index(
                siblings + unmerged, distance)
        except InfiniteSilhouette:
            # zero distance between two clusters, do not accept split.
            return False, None

        # if the index is bigger than before, then more clusters are
        # in the right places than before, and we accept the split.
        if self._last_criterion is None or criterion > self._last_criterion:
            return True, ApproximateSilhouetteSplitCriterion(criterion)
        else:
            return False, None

    def _approximate_global_silhouette_index(self, clusters, distance):
        # mean of the per-cluster silhouette widths; -1 (worst possible)
        # when there is only one cluster and nothing to compare.
        if len(clusters) <= 1:
            return -1.
        else:
            return fsum(self._approximate_mean_silhouette_widths(clusters, distance)) / len(clusters)

    def _approximate_mean_silhouette_widths(self, clusters, distance):
        d_in = self._approximate_within_distances(clusters, distance)
        d_out = self._approximate_mins_of_betweens_distances(clusters, distance)
        # the mean is just s(i) here, as we only use medoids for approximation.
        return (_silhouette(a, b) for a, b in zip(d_in, d_out))

    def _approximate_within_distances(self, clusters, distance):
        # mean member-to-medoid distance per cluster; None marks a
        # singleton cluster, which has no within distances at all.
        for c in clusters:
            s, n = c.within(distance)
            if n == 1:
                yield None
            else:
                yield s / (n - 1)

    def _approximate_mins_of_betweens_distances(self, clusters, distance):
        # for each cluster's medoid, the smallest distance to any other
        # cluster, approximated by looking at the other medoids only.
        def medoids():
            for i, a in enumerate(clusters):
                yield i, a.medoid

        def other_medoids(i):
            for j, c in enumerate(clusters):
                if i != j:
                    yield c.medoid

        def other_members(i):
            # exact but slow alternative: all points of other clusters.
            for j, c in enumerate(clusters):
                if i != j:
                    for p in c.members:
                        yield p

        other = other_medoids  # fast version

        for i, a in medoids():
            yield robust_min((distance(a, b) for b in other(i)))
343
# the criterion used when the caller asks for automatic k detection.
AutomaticSplitCriterion = ApproximateSilhouetteSplitCriterion
345
346
347class _Cluster:
348    def __init__(self, medoid, members):
349        self.medoid = medoid
350        self.members = members
351        self._within = None
352
353    def translate(self, i_to_i0):
354        return _Cluster(i_to_i0[self.medoid], [i_to_i0[i] for i in self.members])
355
356    def within(self, distance):
357        if self._within is None:
358            # s_w is the sum of within-cluster (point to its medoid) distances.
359            m = self.medoid
360            s_w = fsum(distance(i, m) for i in self.members if i != m)
361            n_w = len(self.members)
362            self._within = (s_w, n_w)
363
364        return self._within
365
366
class _Medoids:
    """One CLARANS clustering state (see [Ng1994]): k medoids chosen
    from n points, plus each point's nearest and second-nearest medoid
    and the total cost (sum of nearest distances)."""

    def __init__(self, clusterer, k):
        distance = clusterer._distance_lambda()
        n = clusterer._n

        self._n = n
        self._k = k
        self._distance = distance
        self._reset_random_swap()

        # seed the initial medoids via AGORAS (see [Rangel2016])
        # instead of a plain random sample; keep them sorted.
        self._selected = _agoras(n, k, distance)
        self._selected.sort()

        # clusters[j] is the pair (nearest medoid, second-nearest medoid).
        self._clusters = [None] * n
        self._cost = fsum(self._update_clusters(self._unselected))
        self._debug = clusterer.debug

    def clusters(self):
        """Return the current configuration as a list of _Cluster."""
        clusters = self._clusters
        solution = []
        for i in self._selected:
            members = [j for j in self._unselected if clusters[j][0] == i]
            # preserve the original order of the elements by inserting i.
            bisect.insort(members, i)
            solution.append(_Cluster(i, members))
        return solution

    @property
    def _unselected(self):
        # yields all point indices that are not currently medoids;
        # relies on self._selected being kept sorted.
        selected = self._selected
        k = 0

        z = selected[k]
        k += 1
        for i in range(self._n):
            if i == z:
                if k < len(selected):
                    z = selected[k]
                    k += 1
                else:
                    z = -1
            else:
                yield i

    def _reset_random_swap(self):
        # restart the stream of (medoid slot, non-medoid slot) swap
        # candidates; each candidate is produced at most once per reset.
        self._random_swap = iter(_shuffled_tuples(self._k, self._n - self._k))

    def _next_random_swap(self):
        # we modify ClaraNS by keeping track of the randomly picked swaps and not trying the same swap twice if no
        # improvement was made in the meantime. this is where the next not-yet-picked tuple is produced.

        n_i, h = next(self._random_swap)

        # h counts unselected points only; shift it past the sorted
        # medoid indices to turn it into an absolute point index.
        for j in self._selected:
            if h >= j:
                h += 1

        return self._selected[n_i], h

    def _update_clusters(self, a):
        """Recompute, for each point index in a, its two nearest medoids;
        yields each point's distance to its nearest medoid."""
        selected = self._selected
        clusters = self._clusters
        distance = self._distance

        for j in a:
            s1, d1, s2, d2 = _smallest2(distance(i, j) for i in selected)
            i1 = selected[s1]
            i2 = selected[s2]
            assert i1 != i2
            clusters[j] = (i1, i2)
            yield d1

    def cost(self):
        """Total distance of all points to their nearest medoid."""
        return self._cost

    def swap(self):
        """Try one random (medoid i, non-medoid h) swap. Apply it and
        return True if it lowers the cost; return False otherwise, and
        also once every candidate swap has been tried without gain."""
        try:
            i, h = self._next_random_swap()
        except StopIteration:
            self._debug('all swaps tried')
            return False

        self._debug('eval swap', i, h)

        # try to swap medoid i with non-medoid h
        clusters = self._clusters
        distance = self._distance

        # we differentiate fast_updates and slow_updates. fast_updates
        # need to update only the second-nearest medoid. slow_updates
        # need to update the nearest and second-nearest medoid.
        fast_updates = []

        def calculate_t():
            # yields the individual cost deltas of the swap; their sum
            # t is the total cost change (negative = improvement).

            # i attaches to a medoid
            yield min(distance(i, j) for j in chain(self._selected, [h]) if j != i)

            # h detaches from its medoid
            yield -distance(h, clusters[h][0])

            # look at all other points
            for j in self._unselected:
                if j == h:
                    continue
                # see [Ng1994] for a description of the following calculations
                n1, n2 = clusters[j]  # nearest two medoids
                dh = distance(j, h)  # d(Oj, Oh)
                if n1 == i:  # is j in cluster i?
                    d2 = distance(j, n2)  # d(Oj, Oj,2)
                    if dh >= d2:  # case (1); j does not go to h
                        yield d2 - distance(j, i)
                        fast_updates.append((j, n2))
                    else:  # case (2); j goes to h
                        yield dh - distance(j, i)
                        fast_updates.append((j, h))
                else:
                    k = clusters[j][0]
                    d2 = distance(j, k)
                    if dh >= d2:  # case (3)
                        # j stays in current cluster. second nearest medoid
                        # to j might change with the introduction of h though.
                        fast_updates.append((j, k))
                    else:  # case (4)
                        yield dh - d2
                        fast_updates.append((j, h))

        t = fsum(calculate_t())

        if t < 0:  # swap is an improvement?
            self._debug('ACCEPT swap t:%f' % t, i, h)
            self._cost += t

            selected = self._selected
            del selected[selected.index(i)]
            bisect.insort(selected, h)

            slow_updates = [i]  # update i's distances to medoids
            for j, m in fast_updates:
                if m == i:
                    slow_updates.append(j)
                else:
                    # nearest medoid m is already known; recompute only
                    # the second-nearest among the remaining medoids.
                    min_d = None
                    min_k = None
                    for k in selected:
                        if k != m:
                            d = distance(k, j)
                            if min_d is None or d < min_d:
                                min_d = d
                                min_k = k
                    assert m != min_k and min_k is not None
                    clusters[j] = (m, min_k)

            # update other second distances.
            for _ in self._update_clusters(slow_updates):
                pass  # ignore distances (no influence on cost here).

            # we swapped, so we want to allow previously used partners.
            self._reset_random_swap()

            return True
        else:
            return False
529
530
class _Clusterer:
    """Clusters one (sub)set of points: with_k() runs CLARANS for a
    fixed k, without_k() estimates k recursively through binary splits
    following [Hamerly2003]."""

    # when True, debug() and without_k() print progress information.
    debug_output = False

    def __init__(self, n, i_to_i0, medoid0, siblings, p0, d0):
        # n: number of points in this subproblem
        # i_to_i0: maps local indices 0..n-1 to global point indices
        # medoid0: global index of this subset's medoid (None at the root)
        # siblings: _Cluster instances (global indices) next to this subset
        # p0: the full, original list of points
        # d0: distance function over two global point indices
        self._n = n
        self._i_to_i0 = i_to_i0
        self._medoid0 = medoid0
        self._siblings = siblings
        self._p0 = p0
        self._d0 = d0

    def _distance_lambda(self):
        """Return a distance function over local point indices."""
        d0 = self._d0
        i_to_i0 = self._i_to_i0

        def distance(i, j):
            return d0(i_to_i0[i], i_to_i0[j])

        return distance

    def debug(self, s, *i):
        # prints s together with the points behind the local indices i.
        if self.debug_output:
            print(s, [self._p0[self._i_to_i0[j]] for j in i])

    def with_k(self, k):
        """Cluster into exactly k clusters; returns a list of _Cluster
        over local indices."""
        # this implements CLARANS as described in [Ng1994]. we modify it
        # by doing an AGORAS seed in _Medoids().

        n = self._n
        # [Ng1994] recommends the following values
        num_local = 2  # number of local minima to search for
        max_neighbours = min(max(0.0125 * k * (n - k), 250), k * (n - k))

        min_cost_medoids = None
        for i in range(num_local):
            medoids = _Medoids(self, k)

            self.debug('new local %f' % medoids.cost())
            j = 1
            while j <= max_neighbours:
                if medoids.swap():
                    self.debug('NEW MEDOIDS (%f)' % medoids.cost(), *medoids._selected)
                    j = 1
                else:
                    j += 1

            self.debug('end local %f' % medoids.cost())
            if min_cost_medoids is None or medoids.cost() < min_cost_medoids.cost():
                min_cost_medoids = medoids

        return min_cost_medoids.clusters()

    def without_k(self, criterion):
        """Cluster with automatically estimated k; returns a list of
        clusters, each a list of global point indices."""
        # we estimate k using the approach described in [Hamerly2003].

        # this chunks the points (0, 1, ..., n - 1) into clusters. each point i
        # corresponds to a global point index i_to_i0[i]. d0 is a distance function
        # that takes two global point indices.

        n = self._n
        i_to_i0 = self._i_to_i0

        if n < 2:
            return [[i_to_i0[0]]]

        # try a binary split and let the criterion judge whether the
        # split improves on keeping the subset merged.
        clusters = self.with_k(2)
        clusters0 = [c.translate(i_to_i0) for c in clusters]

        merged = _Cluster(self._medoid0, list(chain(*[c.members for c in clusters0])))

        split, new_criterion = criterion.should_split(
            self._siblings, merged, clusters0, self._d0)

        if self.debug_output:
            print([[self._p0[i] for i in c.members] for c in self._siblings + clusters0], split, new_criterion)

        if not split:
            return [[i_to_i0[i] for i in range(n)]]
        else:
            # recurse into both halves; the other half is passed along as
            # a sibling so the criterion sees the global configuration.
            r = []
            for i, c in enumerate(clusters):
                t = clusters0[i].members
                d = clusters0[1 - i]

                sub = _Clusterer(len(t), t, clusters0[i].medoid, self._siblings + [d], self._p0, self._d0)
                r.extend(sub.without_k(new_criterion))
            return r
618
619
def optimize(p, k, distances, mode='clusters', seed=12345, granularity=1.):
    """Cluster the points p using CLARANS with AGORAS seeding.

    p: list of points to cluster
    k: number of clusters; 1 for the trivial clustering; or a tuple
       (criterion_class, kwargs) to detect k automatically via a
       SplitCriterion instance
    distances: a PrecomputedDistances or LazyDistances instance
    mode: 'clusters' returns lists of points, 'components' returns the
          1-based cluster index for each point
    seed: fixed random seed so identical input yields identical output
    granularity: currently unused, kept for interface compatibility

    Raises ValueError for an illegal k or mode.
    """
    if k == 1:
        return [p]

    random_state = random.getstate()
    try:
        # we want the same output on every call on the same data, so we use
        # a fixed random seed at this point.
        random.seed(seed)

        clusterer = _Clusterer(
            len(p), tuple(range(len(p))), None, [], p, distances.distance)

        if isinstance(k, tuple) and len(k) == 2:
            criterion = k[0](**k[1])
            assert isinstance(criterion, SplitCriterion)
            clusters = clusterer.without_k(criterion)
        elif isinstance(k, int):
            clusters = [c.members for c in clusterer.with_k(k)]
        else:
            raise ValueError('illegal parameter k "%s"' % str(k))

        # sort clusters by order of their first element in the original list.
        clusters = sorted(clusters, key=lambda c: c[0])

        if mode == 'clusters':
            # materialize the clusters as lists of points. the previous
            # implementation returned lazy map objects (a python 2
            # remnant) that are one-shot iterators under python 3; real
            # lists keep this consistent with agglomerate().
            return [[p[i] for i in c] for c in clusters]
        elif mode == 'components':
            return _components(clusters, len(p))
        else:
            raise ValueError('illegal mode %s' % mode)
    finally:
        random.setstate(random_state)
653
654
class MergeCriterion(object):
    """Base class deciding whether two clusters may be merged during
    agglomerative clustering."""

    def __init__(self, distances, n):
        # distances: triangular distance matrix; n: number of points.
        self.distances = distances
        self.n = n

    def merge(self, clusters, i, j, d_min, save):
        """Return True to merge clusters i and j (at distance d_min),
        False to stop; may call save() to snapshot the configuration."""
        raise NotImplementedError()

    def _fast_distance(self):
        # bind the matrix locally so the returned callable avoids an
        # attribute lookup on every call.
        matrix = self.distances
        return lambda x, y: matrix[_index(max(x, y), min(x, y))]
666
667
class FixedDistanceCriterion(MergeCriterion):
    """Merge criterion that allows merges up to a fixed distance."""

    def __init__(self, distances, n, merge_limit):
        super(FixedDistanceCriterion, self).__init__(distances, n)
        # the largest distance at which two clusters still merge.
        self._merge_limit = merge_limit

    def merge(self, clusters, i, j, d_min, save):
        # keep merging while the closest pair is within the limit.
        return d_min <= self._merge_limit
675
676
class _DunnMergeCriterion(MergeCriterion):
    # implements a Dunn index, as for example described in [Desgraupes2013].

    def __init__(self, distances, n, merge_limit=None):
        # merge_limit: do not merge points above this distance; if the merged
        # distance falls above this limit, the clustering is stopped and the
        # best clustering so far is returned.

        super(_DunnMergeCriterion, self).__init__(distances, n)

        # per-cluster diameters; entry j becomes None once cluster j
        # has been merged away.
        self._diameters = [0.] * n
        self._max_diameter = 0.
        # best Dunn index seen so far, kept as an unevaluated ratio
        # (min inter-cluster distance, max diameter) to avoid division.
        self._best_dunn = None

        self._merge_limit = merge_limit

    def merge(self, clusters, i, j, d_min, save):
        """Snapshot the partition if its Dunn index beats the best so
        far, then merge clusters i and j (updating the diameters) and
        return whether clustering should continue."""
        # save the current partition if it's better than before.

        dunn = (d_min, self._max_diameter)
        if self._best_dunn is None or _ratio_bigger_than(dunn, self._best_dunn):
            save()
            if self._max_diameter > 0.:
                self._best_dunn = dunn

        # now perform the merge.

        if self._merge_limit is not None and d_min > self._merge_limit:
            return False

        distance = self._fast_distance()
        # diameter of the merged cluster: largest cross distance, or
        # one of the two previous diameters.
        new_diameter = max(distance(x, y) for x in clusters[i] for y in clusters[j])

        diameters = self._diameters
        diameters[i] = max(diameters[i], diameters[j], new_diameter)
        diameters[j] = None
        self._max_diameter = max(self._max_diameter, diameters[i])

        return True
716
717
# the criterion used when the caller asks for automatic merging.
AutomaticMergeCriterion = _DunnMergeCriterion
719
720
def agglomerate(points_and_weights, k, distances, mode='clusters'):
    """Heap-based agglomerative clustering (single linkage, except in
    'dominant' mode); see [Kurita1991]."""
    # this is an implementation of heap-based clustering as described
    # by [Kurita1991].

    # runs in O(N^2 log(N)) when N items are clustered, with memory
    # of O(N^2).

    # two notes on this implementation:

    # (1) we work with 0-based indices, whereas Kurita's pseudocode
    # uses 1-based indices.

    # (2) when comparing heap elements h[i], h[j], we actually
    # compare tuples of (distance, index, pair of elements), which
    # guarantees that clusterings will produce the same results
    # no matter how the input data is ordered, and that h[i] != h[j]
    # as long as i != j.

    # parameters:

    # points_and_weights: list of points to cluster; in 'dominant' mode
    # a tuple (points, weights) instead
    # k: number of clusters to form, or a (criterion_class, kwargs)
    # tuple for automatic detection via a MergeCriterion
    # distances: an instance of PrecomputedDistances
    # mode: 'clusters' returns clusters, 'dominant' returns one dominant
    # representant of each cluster only, 'components' returns the index of
    # the cluster each element is in for each element.

    if mode == 'dominant':
        points, weight_ = points_and_weights
        weight = [x for x in weight_]
    else:
        points = points_and_weights

    # clusters[i] is the member list of cluster i, or None once i has
    # been merged into another cluster.
    clusters = [[i] for i in range(len(points))]

    def shiftdown(s, heap, where):
        # restore the min-heap property below position s, keeping the
        # where[] (pair index -> heap position) mapping in sync.
        e = len(heap) - 1

        i = s
        j = 2 * i + 1  # i.e. 2 * (i + 1) - 1
        if j > e:
            return

        x, p, _ = xp = heap[i]

        while j <= e:
            if j < e and heap[j] > heap[j + 1]:
                j += 1
            if xp <= heap[j]:
                break

            xx, pp, _ = heap[i] = heap[j]
            where[pp] = i

            i = j
            j = 2 * i + 1

        heap[i] = xp
        where[p] = i

    def shiftup(s, heap, where):
        # restore the min-heap property above position s, keeping the
        # where[] mapping in sync.
        i = s
        j = (i + 1) // 2 - 1
        if j < 0:
            return

        x, p, _ = xp = heap[i]

        while j >= 0:
            if heap[j] <= xp:
                break

            xx, pp, _ = heap[i] = heap[j]
            where[pp] = i

            i = j
            j = (i + 1) // 2 - 1

        heap[i] = xp
        where[p] = i

    def update(s, xp, heap, where):
        # replace the element at heap position s with xp and restore
        # the heap property in whichever direction is needed.
        xxpp = heap[s]
        assert xp != xxpp

        if xp > xxpp:
            heap[s] = xp
            shiftdown(s, heap, where)
        else:
            heap[s] = xp
            shiftup(s, heap, where)

    def remove(s, heap, where):
        # remove the element at heap position s by moving the last
        # element into its place; returns the shrunk heap.
        if s == len(heap) - 1:
            heap = heap[:-1]
        else:
            xx, pp, _ = xxpp = heap[-1]
            heap = heap[:-1]
            where[pp] = s
            update(s, xxpp, heap, where)

        return heap

    def unmerged_pairs(i, j, n):
        # yields the _index() values of all unmerged pairs (1, i),
        # (2, i), ... except for (i, i) and (i, j)

        # example of unmerged_pairs() output (none merged yet):
        # n = 4, i = 1, j = 3
        # unmerged_pairs(1, 3, 4) -> (1, 0), (2, 1)
        # unmerged_pairs(3, 1, 4) -> (3, 0), (3, 2)

        for r in range(i):  # all (i, _)
            if r != j and clusters[r]:  # r is not yet merged?
                yield _index(i, r)

        # skip (i, i)

        for r in range(i + 1, n):  # all (_, i)
            if r != j and clusters[r]:  # r is not yet merged?
                yield _index(r, i)

    def reduce():
        # the main clustering loop (note: intentionally shadows the
        # builtin reduce, which this function does not use).
        n = len(points)
        triangular_distance_matrix = distances.matrix()

        if isinstance(k, tuple) and len(k) == 2:
            criterion = k[0](triangular_distance_matrix, n, **k[1])
            assert isinstance(criterion, MergeCriterion)
            n_clusters_target = 1
        elif isinstance(k, int):
            criterion = None
            n_clusters_target = k
        else:
            raise ValueError('illegal k "%s"' % str(k))

        pairs = [(points[i], points[j]) for i in range(n) for j in range(i)]
        lookup = [(i, j) for i in range(n) for j in range(i)]

        # where[p] is the current heap position of pair index p.
        where = list(range(len(triangular_distance_matrix)))
        heap = [(d, z, u) for d, z, u in zip(
            triangular_distance_matrix, where, pairs)]

        # heapify bottom-up by sifting down every inner node.
        for s in range(len(heap) // 2 - 1, -1, -1):  # ..., 1, 0
            shiftdown(s, heap, where)

        n_clusters = n

        # if the criterion does not call save(), "best" is always the current (last) configuration.
        # save() allows to put a different configuration into "best" and keep on clustering and
        # return the "best" configuration later on as result.

        if mode in ('clusters', 'components'):
            dominant = False
            best = [clusters]

            def save():  # save current configuration
                best[0] = [c[:] for c in clusters if c]

            def result():
                best_clusters = best[0]

                # sort, so clusters appear in order of their first element appearance in the original list.
                r = sorted([sorted(c) for c in best_clusters if c], key=lambda c: c[0])

                if mode == 'components':
                    return _components(r, n)
                elif mode == 'clusters':
                    return [[points[i] for i in c] for c in r]
        elif mode == 'dominant':
            dominant = True
            best = [clusters, weight]

            def save():  # save current configuration
                best[0] = [c[:] for c in clusters if c]
                best[1] = weight[:]

            def result():
                best_clusters, best_weight = best
                prototypes = [(points[i], best_weight[i], c) for i, c in enumerate(best_clusters) if c is not None]
                return sorted(prototypes, key=lambda t: t[1], reverse=True)  # most weighted first
        else:
            raise ValueError('illegal mode %s' % mode)

        while len(heap) > 0 and n_clusters > n_clusters_target:
            d, p, _ = heap[0]

            i, j = lookup[p]

            if dominant:
                if weight[j] > weight[i]:  # always merge smaller (j) into larger, dominant (i)
                    i, j = j, i
            elif i > j:
                i, j = j, i  # merge later chunk to earlier one to preserve order

            if criterion and not criterion.merge(clusters, i, j, d, save):
                break

            heap = remove(where[p], heap, where)  # remove distance (i, j)

            # look at each connection to i, and each connection to j, so that we look at the same
            # points connection to the cluster, i.e. (a=(1, i), b=(1, j)), (a=(2, i), b=(2,i)), etc.
            for a, b in zip(unmerged_pairs(i, j, n), unmerged_pairs(j, i, n)):
                # first, remove the distance to j from the heap, as only the distance to i should
                # remain, as the new cluster will be i.

                y, py, u = heap[where[b]]
                heap = remove(where[b], heap, where)

                # now update the distance to i with j's distance, if j was a shorter distance. this
                # implements a "single linkage" clustering, where the distance of an outside point to
                # the cluster is always the shortest distance from that outside point to any point in
                # the cluster.

                # in the "dominant" mode, we do not want this. instead, we want to keep the distance
                # from outside points to the dominant element in the cluster, which is always "a",
                # so that the clustering depends on the distance of cluster center to new elements.
                # using single linkage with "dominant" would fray the borders of the clustering, so
                # that the dominant elements are not really dominant anymore.

                if not dominant:  # use single linkage?
                    x, px, _ = heap[where[a]]
                    if y < x:  # compare only values here, and not tuples (x, p)
                        update(where[a], (y, px, u), heap, where)

            if dominant:
                weight[i] += weight[j]
                weight[j] = 0

            clusters[i].extend(clusters[j])
            clusters[j] = None

            n_clusters -= 1

        return result()

    return reduce()
958
959
class _KMeans:
    """k-means clustering using Hamerly's accelerated algorithm
    (see [Hamerly2010]) with k-means++ seeding (see [Arthur2007])."""

    def __init__(self, x, d, epsilon):
        # x: list of points (each an indexable numeric sequence)
        # d: distance function d(p, q) over such points
        # epsilon: convergence threshold on the total center movement
        self.x = x
        self.d = d
        self.epsilon = epsilon

    def _pick_initial(self):
        """Yield an endless stream of initial centroids chosen by k-means++:
        the first is uniformly random, each further one is sampled with
        probability proportional to its squared distance to the nearest
        centroid picked so far ("D^2 weighting")."""
        # use k-means++, see [Arthur2007]

        x = self.x
        d = self.d

        candidates = list(range(len(x)))

        # first centroid: uniform random choice.
        i = random.randint(0, len(candidates) - 1)
        new_centroid = x[candidates[i]]
        yield new_centroid

        def swap_delete(a, i):
            # O(1) removal: overwrite a[i] with the last element, drop the
            # tail. element order is not preserved, which is fine here since
            # candidates and distances are always permuted in lockstep.
            j = len(a) - 1
            a[i] = a[j]
            del a[j]

        def random_choice(m, p):  # weighted random choice
            # pick index i with probability p[i] / m, where m == sum(p).
            r = random.uniform(0, m)
            for i, z in enumerate(chain(p, [None])):
                if z is None:
                    # float rounding pushed r past the total; use last index.
                    return i - 1
                if r < z:
                    return i
                r -= z

        def d2(x, y):
            # squared distance, as k-means++ weighting prescribes.
            distance = d(x, y)
            return distance * distance

        swap_delete(candidates, i)

        # distances[i]: squared distance from candidates[i] to its nearest
        # already-chosen centroid.
        distances = [d2(new_centroid, x[candidate]) for candidate in candidates]
        sum_of_distances = sum(distances)
        while True:
            i = random_choice(sum_of_distances, distances)
            new_centroid = x[candidates[i]]
            yield new_centroid

            swap_delete(candidates, i)
            sum_of_distances -= distances[i]
            swap_delete(distances, i)

            # the new centroid may now be the nearest one for some candidates;
            # refresh their distances (and the running sum) accordingly.
            for i, candidate_distance in enumerate(zip(candidates, distances)):
                candidate, old_distance = candidate_distance
                new_distance = d2(new_centroid, x[candidate])
                if new_distance < old_distance:
                    distances[i] = new_distance
                    sum_of_distances -= old_distance - new_distance

    def _distances(self, c):
        """Precompute all pairwise distances between the centers ``c`` and
        return a symmetric lookup function over center indices. The lookup
        must not be called with two equal indices (no diagonal entries)."""
        d = self.d
        k = len(c)

        # lower-triangular storage, keyed by j1 * k + j2 with j2 < j1.
        between = {}
        for j1, c1 in enumerate(c):
            z = j1 * k
            for j2, c2 in enumerate(c[:j1]):
                between[z + j2] = d(c1, c2)

        return lambda x, y: between[max(x, y) * k + min(x, y)]

    def _kmeans(self, k):
        """Run one accelerated k-means pass with ``k`` clusters; return the
        assignment vector and an approximate silhouette index (or -1. when a
        cluster degenerates to a single point)."""
        # implements an improved version of kmeans, see [Hamerly2010].
        x = self.x
        d = self.d

        assert k <= len(x)

        # pick initial clusters. actually quite an important step.
        c = list(islice(self._pick_initial(), 0, k))
        assert len(c) == k

        # q[j]: number of points currently assigned to cluster j.
        q = [0] * len(c)
        # cc[j]: componentwise sum of all points assigned to cluster j, so
        # cluster j's center is cc[j] / q[j].
        cc = [[0] * len(x[0]) for _ in range(k)]

        # per-point state, as in [Hamerly2010]:
        # a[i]: index of the cluster point i is assigned to.
        a = [0] * len(x)
        # u[i]: upper bound on the distance from point i to its own center.
        u = [0] * len(x)
        # l[i]: lower bound on the distance from point i to any other center.
        l = [0] * len(x)

        for i, xi in enumerate(x):
            # _smallest2 (defined elsewhere in this file) presumably returns
            # (argmin, min, arg of 2nd smallest, 2nd smallest) — TODO confirm.
            ai, u[i], _, l[i] = _smallest2(d(xi, cj) for cj in c)
            q[ai] += 1
            cc[ai] = _pairwise_sum(cc[ai], xi)
            a[i] = ai

        # s[j]: distance from center j to its closest other center.
        s = [0] * len(c)
        # p[j]: how far center j moved in the last iteration.
        p = [0] * len(c)
        change = None

        while change is None or change > self.epsilon:
            # find cluster distances
            cd = self._distances(c)
            s = [min(cd(j1, j2) for j2 in range(k) if j2 != j1) for j1 in range(k)]

            # find new assignments
            for i, ai in enumerate(a):
                # Hamerly's pruning test: point i cannot change clusters
                # unless its upper bound exceeds both half the distance to the
                # nearest other center and its lower bound.
                m = max(s[ai] / 2., l[i])

                if u[i] > m:
                    # tighten the upper bound to the exact distance, retest.
                    xi = x[i]
                    u[i] = d(xi, c[ai])

                    if u[i] > m:
                        new_ai, u[i], _, l[i] = _smallest2(d(xi, cj) for cj in c)

                        if new_ai != ai:
                            # move point i from cluster ai to new_ai,
                            # maintaining counts and coordinate sums.
                            q[ai] -= 1
                            q[new_ai] += 1
                            cc[ai] = [a - b for a, b in zip(cc[ai], xi)]
                            cc[new_ai] = [a + b for a, b in zip(cc[new_ai], xi)]

                            a[i] = new_ai

            # move centers
            empty_cluster = False

            for j in range(len(c)):
                qj = q[j]
                if qj == 0:
                    # a cluster lost all of its points; abandon this run.
                    empty_cluster = True
                    break
                c_old = c[j]
                c_new = [ccj / qj for ccj in cc[j]]
                distance = d(c_old, c_new)
                c[j] = c_new
                p[j] = distance

            if empty_cluster:
                break

            # update bounds
            # _largest2 (defined elsewhere) presumably returns
            # (argmax, max, arg of 2nd largest, 2nd largest) — TODO confirm.
            r1, p1, r2, p2 = _largest2(p)
            for i, ai in enumerate(a):
                # an upper bound can grow by at most the point's own center's
                # movement; a lower bound can shrink by at most the largest
                # movement among the *other* centers.
                u[i] += p[ai]
                if r1 == ai:
                    l[i] -= p2
                else:
                    l[i] -= p1

            change = sum(p)

        # compute an approximate silhouette index
        within = [0] * len(c)
        for i, ai in enumerate(a):
            within[ai] += d(x[i], c[ai])
        for j in range(len(c)):
            if q[j] == 1:
                return a, -1.  # no good config
            within[j] /= q[j] - 1

        silhouette = fsum(_silhouette(a, b) for a, b in zip(within, s)) / len(c)
        return a, silhouette

    def _optimize(self, solutions):
        """Greedily consume (assignment, silhouette, k) tuples from the
        ``solutions`` generator factory and keep the best one; stop as soon
        as the silhouette stops improving."""
        best_a = None
        best_silhouette = None
        best_k = None

        for a, silhouette, k in solutions():
            if best_silhouette is None:
                pass  # always accept the first solution
            elif silhouette <= best_silhouette:
                break  # no improvement; stop searching
            best_silhouette = silhouette
            best_a = a
            best_k = k

        return best_a, best_silhouette, best_k

    def with_k(self, k):
        """Repeat k-means with fixed ``k`` (fresh random seedings) until the
        silhouette stops improving; return (assignments, silhouette, k)."""
        def solutions():
            while True:
                a, s = self._kmeans(k)
                yield a, s, k

        return self._optimize(solutions)

    def without_k(self):
        """Search for a good cluster count: try k = 2, 3, ... and stop when a
        larger k no longer improves the silhouette."""
        def solutions():
            k = 2
            while k < len(self.x):
                yield self.with_k(k)
                k += 1

        return self._optimize(solutions)
1152
1153
1154def _squared_euclidean_distance(a, b):
1155    s = None
1156    for x, y in zip(a, b):
1157        d = x - y
1158        if s is None:
1159            s = d * d
1160        else:
1161            s += d * d
1162    return s
1163
1164
1165def _clusters(x, a, k):
1166    clusters = [[] for _ in range(k)]
1167    add = [c.append for c in clusters]
1168    for i, j in enumerate(a):
1169        add[j](x[i])
1170    return clusters
1171
1172
def kmeans(x, x_repr, k, mode, seed, epsilon):
    """Cluster the points ``x`` with k-means (squared Euclidean distance).

    x: numeric vectors to cluster; x_repr: parallel list of representations
    returned in 'clusters' mode (must have the same length as x).
    k: number of clusters, or None to search for a good k automatically.
    mode: 'clusters' returns the grouped x_repr values; 'components' returns
    the raw assignment vector. seed: seeds the module-level RNG for
    reproducibility. epsilon: convergence threshold for center movement.

    Raises ValueError for an illegal mode.
    """
    assert len(x) == len(x_repr)

    # fail fast: reject an illegal mode *before* running the (potentially
    # expensive) clustering. previously the error surfaced only afterwards.
    if mode not in ('clusters', 'components'):
        raise ValueError('illegal mode %s' % mode)

    random.seed(seed)
    km = _KMeans(x, _squared_euclidean_distance, epsilon)

    if k is None:
        a, _, k = km.without_k()
    else:
        assert 1 < k < len(x)
        a, _, k = km.with_k(k)

    if mode == 'clusters':
        return _clusters(x_repr, a, k)
    else:  # mode == 'components', guaranteed by the check above
        return a