1#!/usr/local/bin/python3.8 2# -*- coding: utf-8 -*- 3 4 5import random 6from itertools import chain, islice 7import bisect 8import math 9 10from mpmath import fsum 11from mathics.core.util import robust_min 12 13# publications used for this file: 14 15# [Kurita1991] Takito Kurita: "An Efficient Agglomerative Clustering Algorithm Using A Heap", Pattern Recognition, 16# Volume 24 Issue 3, 1991. 17# [Ng1994] Raymond T. Ng, Jiawei Han: Efficient and Effective Clustering Methods for Spatial Data Mining. 18# VLDB 1994: 144-155 19# [Rangel2016] E. Rangel, W. Hendrix, A. Agrawal, W.-keng Liao, and A. Choudhary, “AGORAS: A Fast Algorithm for 20# Estimating Medoids in Large Datasets,” in Proceedings of International Conference on Computational Science (ICCS), 21# 2016. 22# [Hamerly2003] Greg Hamerly, Charles Elkan, Learning the k in k-means. In proceedings of the seventeenth 23# annual conference on neural information processing systems (NIPS), pages 281-288, December 2003. 24# [Desgraupes2013] Bernard Desgraupes, Package clusterCrit for R: Clustering Indices, available online at 25# https://cran.r-project.org/web/packages/clusterCrit/vignettes/clusterCrit.pdf 26# [Greutert2003] Andreas Greutert, "Methoden zur Schätzung der Clusteranzahl", Ausgabe: 29. Oktober 2003. 27# Diplomarbeit. ETH Zürich. 28# [Frey2007] Brendan J. Frey, Delbert Dueck, "Clustering by Passing Messages Between Data Points", 29# Science 16 Feb 2007: Vol. 315, Issue 5814, pp. 972-976 30# [Arthur2007] Arthur, D.; Vassilvitskii, S. (2007). "k-means++: the advantages of careful seeding", 31# Proceedings of the eighteenth annual ACM-SIAM symposium on Discrete algorithms. Society for Industrial and 32# Applied Mathematics Philadelphia, PA, USA. pp. 1027–1035. 33# [Hamerly2010] Greg Hamerly, Making k-means even faster In proceedings of the 2010 SIAM international 34# conference on data mining (SDM 2010), April 2010. 35 36# for agglomerative clustering, we use [Kurita1991]. 
# for hierarchical clustering, we use ClaraNS (see [Ng1994]), but we start local searches not with a random sample,
# but with a seed computed via AGORAS (see [Rangel2016]). for clustering without knowing the number of clusters k,
# we use the approach described in [Hamerly2003]. further improvements might be gained through [Frey2007].


def _index(i, j):
    """Linear index of (i, j), i > j, into a packed lower-triangular matrix.

    Returns j + sum(1, 2, ..., i - 1):
        x x x
        a x x
        b c x
    with a = 0, b = 1, c = 2.
    """
    return j + ((i - 1) * i) // 2


def _ordered2(a, cmp):
    """Return (i1, x1, i2, x2): indices and values of the two "best"
    elements of iterable a according to cmp. a must yield at least two
    elements; x1 is the best, x2 the second best."""
    iterator = iter(a)

    min_i1 = 0
    min_i2 = 1

    min_x1 = next(iterator)
    min_x2 = next(iterator)

    if not cmp(min_x1, min_x2):
        min_i1, min_i2 = min_i2, min_i1
        min_x1, min_x2 = min_x2, min_x1

    try:
        i = 2
        while True:
            x = next(iterator)
            if cmp(x, min_x1):
                min_x2 = min_x1
                min_i2 = min_i1
                min_x1 = x
                min_i1 = i
            elif cmp(x, min_x2):
                min_x2 = x
                min_i2 = i
            i += 1
    except StopIteration:
        return min_i1, min_x1, min_i2, min_x2


def _smallest2(a):
    """Indices and values of the two smallest elements of a."""
    return _ordered2(a, lambda x, y: x < y)


def _largest2(a):
    """Indices and values of the two largest elements of a."""
    return _ordered2(a, lambda x, y: x > y)


def _components(clusters, n):
    """Turn a list of clusters (lists of point indices) into a flat list
    of n 1-based component indices, one per point."""
    components = [0] * n
    for i, c in enumerate(clusters):
        component_index = i + 1
        for j in c:
            components[j] = component_index
    return components


def _clusters(x, a, k):
    """Group the elements of x into k clusters according to the
    assignment list a (a[i] is the cluster index of x[i])."""
    clusters = [[] for _ in range(k)]
    add = [c.append for c in clusters]
    for i, j in enumerate(a):
        add[j](x[i])
    return clusters


def _ratio_bigger_than(a, b):
    # check if (a1 / a2) > (b1 / b2) without dividing
    a1, a2 = a
    b1, b2 = b
    return a1 * b2 > b1 * a2


def _pairwise_sum(a, b):
    """Element-wise sum of two equal-length vectors."""
    return [x + y for x, y in zip(a, b)]


class InfiniteSilhouette(Exception):
    # thrown when two clusters have distance 0
    pass


def _silhouette(a, b):
    """Silhouette width for within-distance a and between-distance b.

    a may be None (infinite, i.e. only one element in the cluster and
    thus no distances), in which case 0 is returned. Raises
    InfiniteSilhouette if both distances are 0.
    """
    if a is None:
        return 0.

    # for the formula, see [Desgraupes2013].
    try:
        s = (b - a) / max(a, b)
    except ZeroDivisionError:  # max(a, b) == 0?
        raise InfiniteSilhouette()

    return s


class PrecomputedDistances(object):
    """Distance provider backed by a packed lower-triangular matrix of
    precomputed pairwise distances."""

    def __init__(self, distances):
        self._distances = distances

    def distance(self, i, j):
        return self._distances[_index(max(i, j), min(i, j))]

    def matrix(self):
        return self._distances


class LazyDistances(object):
    """Distance provider that computes (and caches) distances on demand.
    Subclasses must implement _compute_distance."""

    def __init__(self):
        self._computed = {}

    def distance(self, i, j):
        computed = self._computed
        index = _index(max(i, j), min(i, j))
        d = computed.get(index, None)
        if d is None:
            # we are probably in the middle of a (seeded) random
            # computation, and we do not want the distance function
            # to mess with that state.
            random_state = random.getstate()
            try:
                d = self._compute_distance(i, j)
            finally:
                random.setstate(random_state)
            computed[index] = d
        return d

    def matrix(self):
        raise ValueError('LazyDistances does not support matrix()')

    def _compute_distance(self, i, j):
        raise ValueError('_compute_distance was not implemented')


def _shuffled_range(n):
    # returns all numbers from [0, ..., n - 1] in random order, never returning any number twice.

    # basically a Fisher-Yates shuffle that does not materialize the data, which means it's especially suited for
    # k << n, where k is the count of random numbers actually retrieved.

    # performs surprisingly well even if all random numbers are retrieved. getting all available numbers from
    # shuffled_range(10^7) only takes only 70% longer than generating just the raw 10 million random numbers.

    a = dict()

    def nth(k):
        return a.get(k, k)

    for i in range(n - 1):
        ai = nth(i)

        j = random.randint(i, n - 1)
        aj = nth(j)

        a[j] = ai
        yield aj

    yield nth(n - 1)


def _shuffled_tuples(*t):
    # like _shuffled_range, but not returning integers, but tuples
    # of integers within given ranges.

    # we map the tuple values to a linear index, then use _shuffled_range.

    # e.g. for t = (2, 3), we map:
    # 0 -> (0, 0); 1 -> (1, 0); 2 -> (0, 1)
    # 3 -> (1, 1); 4 -> (0, 2); 5 -> (1, 2)

    n = 1
    for x in t:
        n *= x
    indices = iter(_shuffled_range(n))

    while True:
        # PEP 479 (Python 3.7+): a StopIteration escaping a generator
        # body is turned into a RuntimeError; end this generator
        # explicitly when all tuples have been produced so callers see
        # a normal exhaustion (StopIteration from next()).
        try:
            m = next(indices)
        except StopIteration:
            return
        r = []
        for x in t:
            r.append(m % x)
            m = m // x
        yield r


def _unmapped(u, v, distance):
    # maps each object in u to the most similar object in v
    # and returns a version of v from which all unmapped objects
    # have been removed.

    mapped = [False] * len(v)
    for uu in u:
        min_d = None
        min_i = None
        for i, vv in enumerate(v):
            if uu == vv:
                d = 0
            else:
                d = distance(uu, vv)
            if min_d is None or d < min_d:
                min_d = d
                min_i = i
        mapped[min_i] = True
    return [vv for m, vv in zip(mapped, v) if m]


def _agoras(n, k, distance):  # see [Rangel2016]
    """Pick k seed medoids from n points using AGORAS."""
    gamma = 0.577215664901532  # Euler-Mascheroni constant
    m = int(math.ceil(k * (math.log(k) + gamma)))
    r = k * math.log(k) + gamma * k
    p = tuple(range(n))

    if r > n:
        # if k (and thus r) is very large in regard to n, we fall
        # back to simple sampling, since AGORAS is slow then.
        return random.sample(p, k)

    while True:
        s = [random.sample(p, min(int(r), n)) for _ in range(m)]
        for i in range(m - 1):
            s[i + 1] = _unmapped(s[i], s[i + 1], distance)
            if len(s[i + 1]) < k:
                r += r * (m - (i + 1)) / m
                break
        if len(s[-1]) > k:
            r *= 0.95
        elif len(s[-1]) == k:
            return s[-1]


class SplitCriterion(object):
    # note that as this runs as part of an O(n) algorithm, this
    # should not introduce an O(n^2) algorithm. see [Desgraupes2013]
    # and [Greutert2003] for possible algorithms.

    def should_split(self, siblings, merged, unmerged, distance):
        raise NotImplementedError()


class ApproximateSilhouetteSplitCriterion(SplitCriterion):
    # a fast, approximate version of the Silhouette index (see e.g.
    # [Desgraupes2013]), that operates on medoids instead of points,
    # thereby reducing its runtime to O(nk) as opposed to O(n^2).

    def __init__(self, last_criterion=None):
        self._last_criterion = last_criterion

    def should_split(self, siblings, merged, unmerged, distance):
        try:
            criterion = self._approximate_global_silhouette_index(
                siblings + unmerged, distance)
        except InfiniteSilhouette:
            # zero distance between two clusters, do not accept split.
            return False, None

        # if the index is bigger than before, then more clusters are
        # in the right places than before, and we accept the split.
        if self._last_criterion is None or criterion > self._last_criterion:
            return True, ApproximateSilhouetteSplitCriterion(criterion)
        else:
            return False, None

    def _approximate_global_silhouette_index(self, clusters, distance):
        if len(clusters) <= 1:
            return -1.
        else:
            return fsum(self._approximate_mean_silhouette_widths(clusters, distance)) / len(clusters)

    def _approximate_mean_silhouette_widths(self, clusters, distance):
        d_in = self._approximate_within_distances(clusters, distance)
        d_out = self._approximate_mins_of_betweens_distances(clusters, distance)
        # the mean is just s(i) here, as we only use medoids for approximation.
        return (_silhouette(a, b) for a, b in zip(d_in, d_out))

    def _approximate_within_distances(self, clusters, distance):
        for c in clusters:
            s, n = c.within(distance)
            if n == 1:
                yield None
            else:
                yield s / (n - 1)

    def _approximate_mins_of_betweens_distances(self, clusters, distance):
        def medoids():
            for i, a in enumerate(clusters):
                yield i, a.medoid

        def other_medoids(i):
            for j, c in enumerate(clusters):
                if i != j:
                    yield c.medoid

        def other_members(i):
            for j, c in enumerate(clusters):
                if i != j:
                    for p in c.members:
                        yield p

        other = other_medoids  # fast version

        for i, a in medoids():
            yield robust_min((distance(a, b) for b in other(i)))


AutomaticSplitCriterion = ApproximateSilhouetteSplitCriterion


class _Cluster:
    """A cluster given by its medoid index and its member indices."""

    def __init__(self, medoid, members):
        self.medoid = medoid
        self.members = members
        self._within = None  # cached (sum, count) of within distances

    def translate(self, i_to_i0):
        """Map local point indices to global indices via i_to_i0."""
        return _Cluster(i_to_i0[self.medoid], [i_to_i0[i] for i in self.members])

    def within(self, distance):
        if self._within is None:
            # s_w is the sum of within-cluster (point to its medoid) distances.
            m = self.medoid
            s_w = fsum(distance(i, m) for i in self.members if i != m)
            n_w = len(self.members)
            self._within = (s_w, n_w)

        return self._within


class _Medoids:
    """One CLARANS state: k medoids (seeded via AGORAS) plus, for every
    non-medoid point, its nearest and second-nearest medoid."""

    def __init__(self, clusterer, k):
        distance = clusterer._distance_lambda()
        n = clusterer._n

        self._n = n
        self._k = k
        self._distance = distance
        self._reset_random_swap()

        self._selected = _agoras(n, k, distance)
        self._selected.sort()

        # clusters[j] is (nearest medoid, second nearest medoid) of point j.
        self._clusters = [None] * n
        self._cost = fsum(self._update_clusters(self._unselected))
        self._debug = clusterer.debug

    def clusters(self):
        clusters = self._clusters
        solution = []
        for i in self._selected:
            members = [j for j in self._unselected if clusters[j][0] == i]
            # preserve the original order of the elements by inserting i.
            bisect.insort(members, i)
            solution.append(_Cluster(i, members))
        return solution

    @property
    def _unselected(self):
        # yields all point indices that are not currently medoids;
        # relies on self._selected being sorted.
        selected = self._selected
        k = 0

        z = selected[k]
        k += 1
        for i in range(self._n):
            if i == z:
                if k < len(selected):
                    z = selected[k]
                    k += 1
                else:
                    z = -1
            else:
                yield i

    def _reset_random_swap(self):
        self._random_swap = iter(_shuffled_tuples(self._k, self._n - self._k))

    def _next_random_swap(self):
        # we modify ClaraNS by keeping track of the randomly picked swaps and not trying the same swap twice if no
        # improvement was made in the meantime. this is where the next not-yet-picked tuple is produced.

        n_i, h = next(self._random_swap)

        # map h from "index among unselected" to an actual point index.
        for j in self._selected:
            if h >= j:
                h += 1

        return self._selected[n_i], h

    def _update_clusters(self, a):
        # recompute nearest/second-nearest medoids for the points in a,
        # yielding each point's distance to its nearest medoid.
        selected = self._selected
        clusters = self._clusters
        distance = self._distance

        for j in a:
            s1, d1, s2, d2 = _smallest2(distance(i, j) for i in selected)
            i1 = selected[s1]
            i2 = selected[s2]
            assert i1 != i2
            clusters[j] = (i1, i2)
            yield d1

    def cost(self):
        return self._cost

    def swap(self):
        """Try one random medoid/non-medoid swap; returns True if it
        improved the cost and was applied."""
        try:
            i, h = self._next_random_swap()
        except StopIteration:
            self._debug('all swaps tried')
            return False

        self._debug('eval swap', i, h)

        # try to swap medoid i with non-medoid h
        clusters = self._clusters
        distance = self._distance

        # we differentiate fast_updates and slow_updates. fast_updates
        # need to update only the second-nearest medoid. slow_updates
        # need to update the nearest and second-nearest medoid.
        fast_updates = []

        def calculate_t():
            # i attaches to a medoid
            yield min(distance(i, j) for j in chain(self._selected, [h]) if j != i)

            # h detaches from its medoid
            yield -distance(h, clusters[h][0])

            # look at all other points
            for j in self._unselected:
                if j == h:
                    continue
                # see [Ng1994] for a description of the following calculations
                n1, n2 = clusters[j]  # nearest two medoids
                dh = distance(j, h)  # d(Oj, Oh)
                if n1 == i:  # is j in cluster i?
                    d2 = distance(j, n2)  # d(Oj, Oj,2)
                    if dh >= d2:  # case (1); j does not go to h
                        yield d2 - distance(j, i)
                        fast_updates.append((j, n2))
                    else:  # case (2); j goes to h
                        yield dh - distance(j, i)
                        fast_updates.append((j, h))
                else:
                    k = clusters[j][0]
                    d2 = distance(j, k)
                    if dh >= d2:  # case (3)
                        # j stays in current cluster. second nearest medoid
                        # to j might change with the introduction of h though.
                        fast_updates.append((j, k))
                    else:  # case (4)
                        yield dh - d2
                        fast_updates.append((j, h))

        t = fsum(calculate_t())

        if t < 0:  # swap is an improvement?
            self._debug('ACCEPT swap t:%f' % t, i, h)
            self._cost += t

            selected = self._selected
            del selected[selected.index(i)]
            bisect.insort(selected, h)

            slow_updates = [i]  # update i's distances to medoids
            for j, m in fast_updates:
                if m == i:
                    slow_updates.append(j)
                else:
                    min_d = None
                    min_k = None
                    for k in selected:
                        if k != m:
                            d = distance(k, j)
                            if min_d is None or d < min_d:
                                min_d = d
                                min_k = k
                    assert m != min_k and min_k is not None
                    clusters[j] = (m, min_k)

            # update other second distances.
            for _ in self._update_clusters(slow_updates):
                pass  # ignore distances (no influence on cost here).

            # we swapped, so we want to allow previously used partners.
            self._reset_random_swap()

            return True
        else:
            return False


class _Clusterer:
    """Clusters the local points (0, ..., n - 1); each local point i
    corresponds to the global point index i_to_i0[i], and d0 is a
    distance function over global point indices."""

    debug_output = False

    def __init__(self, n, i_to_i0, medoid0, siblings, p0, d0):
        self._n = n
        self._i_to_i0 = i_to_i0
        self._medoid0 = medoid0
        self._siblings = siblings
        self._p0 = p0
        self._d0 = d0

    def _distance_lambda(self):
        d0 = self._d0
        i_to_i0 = self._i_to_i0

        def distance(i, j):
            return d0(i_to_i0[i], i_to_i0[j])

        return distance

    def debug(self, s, *i):
        if self.debug_output:
            print(s, [self._p0[self._i_to_i0[j]] for j in i])

    def with_k(self, k):
        # this implements CLARANS as described in [Ng1994]. we modify it
        # by doing an AGORAS seed in _Medoids().

        n = self._n
        # [Ng1994] recommends the following values
        num_local = 2  # number of local minima to search for
        max_neighbours = min(max(0.0125 * k * (n - k), 250), k * (n - k))

        min_cost_medoids = None
        for i in range(num_local):
            medoids = _Medoids(self, k)

            self.debug('new local %f' % medoids.cost())
            j = 1
            while j <= max_neighbours:
                if medoids.swap():
                    self.debug('NEW MEDOIDS (%f)' % medoids.cost(), *medoids._selected)
                    j = 1
                else:
                    j += 1

            self.debug('end local %f' % medoids.cost())
            if min_cost_medoids is None or medoids.cost() < min_cost_medoids.cost():
                min_cost_medoids = medoids

        return min_cost_medoids.clusters()

    def without_k(self, criterion):
        # we estimate k using the approach described in [Hamerly2003]:
        # recursively try to split into 2 clusters as long as the split
        # criterion accepts the split.

        n = self._n
        i_to_i0 = self._i_to_i0

        if n < 2:
            return [[i_to_i0[0]]]

        clusters = self.with_k(2)
        clusters0 = [c.translate(i_to_i0) for c in clusters]

        merged = _Cluster(self._medoid0, list(chain(*[c.members for c in clusters0])))

        split, new_criterion = criterion.should_split(
            self._siblings, merged, clusters0, self._d0)

        if self.debug_output:
            print([[self._p0[i] for i in c.members] for c in self._siblings + clusters0], split, new_criterion)

        if not split:
            return [[i_to_i0[i] for i in range(n)]]
        else:
            r = []
            for i, c in enumerate(clusters):
                t = clusters0[i].members
                d = clusters0[1 - i]

                sub = _Clusterer(len(t), t, clusters0[i].medoid, self._siblings + [d], self._p0, self._d0)
                r.extend(sub.without_k(new_criterion))
            return r


def optimize(p, k, distances, mode='clusters', seed=12345, granularity=1.):
    """Cluster the points p using CLARANS.

    k is either the number of clusters, or a tuple
    (SplitCriterion subclass, kwargs dict) for automatic detection.
    distances provides pairwise distances (e.g. PrecomputedDistances).
    mode 'clusters' returns lists of points, 'components' returns a
    1-based cluster index per point.
    """
    if k == 1:
        return [p]

    random_state = random.getstate()
    try:
        # we want the same output on every call on the same data, so we use
        # a fixed random seed at this point.
        random.seed(seed)

        clusterer = _Clusterer(
            len(p), tuple(range(len(p))), None, [], p, distances.distance)

        if isinstance(k, tuple) and len(k) == 2:
            criterion = k[0](**k[1])
            assert isinstance(criterion, SplitCriterion)
            clusters = clusterer.without_k(criterion)
        elif isinstance(k, int):
            clusters = [c.members for c in clusterer.with_k(k)]
        else:
            raise ValueError('illegal parameter k "%s"' % str(k))

        # sort clusters by order of their first element in the original list.
        clusters = sorted(clusters, key=lambda c: c[0])

        if mode == 'clusters':
            # note: build real lists here; the previous py2-style
            # list(map(...)) returned lazy map objects in Python 3.
            return [[p[i] for i in c] for c in clusters]
        elif mode == 'components':
            return _components(clusters, len(p))
        else:
            raise ValueError('illegal mode %s' % mode)
    finally:
        random.setstate(random_state)


class MergeCriterion(object):
    """Decides, for agglomerative clustering, whether two clusters may
    be merged; works on a packed triangular distance matrix."""

    def __init__(self, distances, n):
        self.distances = distances
        self.n = n

    def merge(self, clusters, i, j, d_min, save):
        raise NotImplementedError()

    def _fast_distance(self):
        distances = self.distances
        return lambda x, y: distances[_index(max(x, y), min(x, y))]


class FixedDistanceCriterion(MergeCriterion):
    """Merges as long as the merge distance stays below a fixed limit."""

    def __init__(self, distances, n, merge_limit):
        super(FixedDistanceCriterion, self).__init__(distances, n)
        self._merge_limit = merge_limit

    def merge(self, clusters, i, j, d_min, save):
        return d_min <= self._merge_limit


class _DunnMergeCriterion(MergeCriterion):
    # implements a Dunn index, as for example described in [Desgraupes2013].

    def __init__(self, distances, n, merge_limit=None):
        # merge_limit: do not merge points above this distance; if the merged
        # distance falls above this limit, the clustering is stopped and the
        # best clustering so far is returned.

        super(_DunnMergeCriterion, self).__init__(distances, n)

        self._diameters = [0.] * n
        self._max_diameter = 0.
        self._best_dunn = None

        self._merge_limit = merge_limit

    def merge(self, clusters, i, j, d_min, save):
        # save the current partition if it's better than before.

        dunn = (d_min, self._max_diameter)
        if self._best_dunn is None or _ratio_bigger_than(dunn, self._best_dunn):
            save()
            if self._max_diameter > 0.:
                self._best_dunn = dunn

        # now perform the merge.

        if self._merge_limit is not None and d_min > self._merge_limit:
            return False

        distance = self._fast_distance()
        new_diameter = max(distance(x, y) for x in clusters[i] for y in clusters[j])

        diameters = self._diameters
        diameters[i] = max(diameters[i], diameters[j], new_diameter)
        diameters[j] = None
        self._max_diameter = max(self._max_diameter, diameters[i])

        return True


AutomaticMergeCriterion = _DunnMergeCriterion


def agglomerate(points_and_weights, k, distances, mode='clusters'):
    # this is an implementation of heap-based clustering as described
    # by [Kurita1991].

    # runs in O(N^2 log(N)) when N items are clustered, with memory
    # of O(N^2).

    # two notes on this implementation:

    # (1) we work with 0-based indices, whereas Kurita's pseudocode
    # uses 1-based indices.

    # (2) when comparing heap elements h[i], h[j], we actually
    # compare tuples of (distance, index, pair of elements), which
    # guarantees that clusterings will produce the same results
    # no matter how the input data is ordered, and that h[i] != h[j]
    # as long as i != j.

    # parameters:

    # points: list of points to cluster
    # k: number of clusters to form or None for automatic detection
    # distances: an instance of PrecomputedDistances
    # mode: 'clusters' returns clusters, 'dominant' returns one dominant
    # representant of each cluster only, 'components' returns the index of
    # the cluster each element is in for each element.

    if mode == 'dominant':
        points, weight_ = points_and_weights
        weight = [x for x in weight_]
    else:
        points = points_and_weights

    clusters = [[i] for i in range(len(points))]

    def shiftdown(s, heap, where):
        # restore the min-heap property downwards from position s;
        # "where" maps pair index -> heap position and is kept in sync.
        e = len(heap) - 1

        i = s
        j = 2 * i + 1  # i.e. 2 * (i + 1) - 1
        if j > e:
            return

        x, p, _ = xp = heap[i]

        while j <= e:
            if j < e and heap[j] > heap[j + 1]:
                j += 1
            if xp <= heap[j]:
                break

            xx, pp, _ = heap[i] = heap[j]
            where[pp] = i

            i = j
            j = 2 * i + 1

        heap[i] = xp
        where[p] = i

    def shiftup(s, heap, where):
        # restore the min-heap property upwards from position s.
        i = s
        j = (i + 1) // 2 - 1
        if j < 0:
            return

        x, p, _ = xp = heap[i]

        while j >= 0:
            if heap[j] <= xp:
                break

            xx, pp, _ = heap[i] = heap[j]
            where[pp] = i

            i = j
            j = (i + 1) // 2 - 1

        heap[i] = xp
        where[p] = i

    def update(s, xp, heap, where):
        # replace the element at heap position s with xp.
        xxpp = heap[s]
        assert xp != xxpp

        if xp > xxpp:
            heap[s] = xp
            shiftdown(s, heap, where)
        else:
            heap[s] = xp
            shiftup(s, heap, where)

    def remove(s, heap, where):
        # remove the element at heap position s; returns the new heap.
        if s == len(heap) - 1:
            heap = heap[:-1]
        else:
            xx, pp, _ = xxpp = heap[-1]
            heap = heap[:-1]
            where[pp] = s
            update(s, xxpp, heap, where)

        return heap

    def unmerged_pairs(i, j, n):
        # yields all unmerged pairs (1, i), (2, i), ...
        # except for (i, i) and (i, j)

        # example of unmerged_pairs() output (none merged yet):
        # n = 4, i = 1, j = 3
        # unmerged_pairs(1, 3, 4) -> (1, 0), (2, 1)
        # unmerged_pairs(3, 1, 4) -> (3, 0), (3, 2)

        for r in range(i):  # all (i, _)
            if r != j and clusters[r]:  # r is not yet merged?
                yield _index(i, r)

        # skip (i, i)

        for r in range(i + 1, n):  # all (_, i)
            if r != j and clusters[r]:  # r is not yet merged?
                yield _index(r, i)

    def reduce():
        n = len(points)
        triangular_distance_matrix = distances.matrix()

        if isinstance(k, tuple) and len(k) == 2:
            criterion = k[0](triangular_distance_matrix, n, **k[1])
            assert isinstance(criterion, MergeCriterion)
            n_clusters_target = 1
        elif isinstance(k, int):
            criterion = None
            n_clusters_target = k
        else:
            raise ValueError('illegal k "%s"' % str(k))

        pairs = [(points[i], points[j]) for i in range(n) for j in range(i)]
        lookup = [(i, j) for i in range(n) for j in range(i)]

        where = list(range(len(triangular_distance_matrix)))
        heap = [(d, z, u) for d, z, u in zip(
            triangular_distance_matrix, where, pairs)]

        for s in range(len(heap) // 2 - 1, -1, -1):  # ..., 1, 0
            shiftdown(s, heap, where)

        n_clusters = n

        # if the criterion does not call save(), "best" is always the current (last) configuration.
        # save() allows to put a different configuration into "best" and keep on clustering and
        # return the "best" configuration later on as result.

        if mode in ('clusters', 'components'):
            dominant = False
            best = [clusters]

            def save():  # save current configuration
                best[0] = [c[:] for c in clusters if c]

            def result():
                best_clusters = best[0]

                # sort, so clusters appear in order of their first element appearance in the original list.
                r = sorted([sorted(c) for c in best_clusters if c], key=lambda c: c[0])

                if mode == 'components':
                    return _components(r, n)
                elif mode == 'clusters':
                    return [[points[i] for i in c] for c in r]
        elif mode == 'dominant':
            dominant = True
            best = [clusters, weight]

            def save():  # save current configuration
                best[0] = [c[:] for c in clusters if c]
                best[1] = weight[:]

            def result():
                best_clusters, best_weight = best
                prototypes = [(points[i], best_weight[i], c) for i, c in enumerate(best_clusters) if c is not None]
                return sorted(prototypes, key=lambda t: t[1], reverse=True)  # most weighted first
        else:
            raise ValueError('illegal mode %s' % mode)

        while len(heap) > 0 and n_clusters > n_clusters_target:
            d, p, _ = heap[0]

            i, j = lookup[p]

            if dominant:
                if weight[j] > weight[i]:  # always merge smaller (j) into larger, dominant (i)
                    i, j = j, i
            elif i > j:
                i, j = j, i  # merge later chunk to earlier one to preserve order

            if criterion and not criterion.merge(clusters, i, j, d, save):
                break

            heap = remove(where[p], heap, where)  # remove distance (i, j)

            # look at each connection to i, and each connection to j, so that we look at the same
            # points connection to the cluster, i.e. (a=(1, i), b=(1, j)), (a=(2, i), b=(2,i)), etc.
            for a, b in zip(unmerged_pairs(i, j, n), unmerged_pairs(j, i, n)):
                # first, remove the distance to j from the heap, as only the distance to i should
                # remain, as the new cluster will be i.

                y, py, u = heap[where[b]]
                heap = remove(where[b], heap, where)

                # now update the distance to i with j's distance, if j was a shorter distance. this
                # implements a "single linkage" clustering, where the distance of an outside point to
                # the cluster is always the shortest distance from that outside point to any point in
                # the cluster.

                # in the "dominant" mode, we do not want this. instead, we want to keep the distance
                # from outside points to the dominant element in the cluster, which is always "a",
                # so that the clustering depends on the distance of cluster center to new elements.
                # using single linkage with "dominant" would fray the borders of the clustering, so
                # that the dominant elements are not really dominant anymore.

                if not dominant:  # use single linkage?
                    x, px, _ = heap[where[a]]
                    if y < x:  # compare only values here, and not tuples (x, p)
                        update(where[a], (y, px, u), heap, where)

            if dominant:
                weight[i] += weight[j]
                weight[j] = 0

            clusters[i].extend(clusters[j])
            clusters[j] = None

            n_clusters -= 1

        return result()

    return reduce()


class _KMeans:
    """k-means on coordinate vectors x with distance d, implementing the
    accelerated algorithm from [Hamerly2010] with k-means++ seeding."""

    def __init__(self, x, d, epsilon):
        self.x = x
        self.d = d
        self.epsilon = epsilon

    def _pick_initial(self):
        # use k-means++, see [Arthur2007]

        x = self.x
        d = self.d

        candidates = list(range(len(x)))

        i = random.randint(0, len(candidates) - 1)
        new_centroid = x[candidates[i]]
        yield new_centroid

        def swap_delete(a, i):
            # O(1) deletion: move the last element into slot i.
            j = len(a) - 1
            a[i] = a[j]
            del a[j]

        def random_choice(m, p):  # weighted random choice
            r = random.uniform(0, m)
            for i, z in enumerate(chain(p, [None])):
                if z is None:
                    return i - 1
                if r < z:
                    return i
                r -= z

        def d2(x, y):
            distance = d(x, y)
            return distance * distance

        swap_delete(candidates, i)

        distances = [d2(new_centroid, x[candidate]) for candidate in candidates]
        sum_of_distances = sum(distances)
        while True:
            i = random_choice(sum_of_distances, distances)
            new_centroid = x[candidates[i]]
            yield new_centroid

            swap_delete(candidates, i)
            sum_of_distances -= distances[i]
            swap_delete(distances, i)

            for i, candidate_distance in enumerate(zip(candidates, distances)):
                candidate, old_distance = candidate_distance
                new_distance = d2(new_centroid, x[candidate])
                if new_distance < old_distance:
                    distances[i] = new_distance
                    sum_of_distances -= old_distance - new_distance

    def _distances(self, c):
        # precompute pairwise centroid distances; returns a lookup lambda.
        d = self.d
        k = len(c)

        between = {}
        for j1, c1 in enumerate(c):
            z = j1 * k
            for j2, c2 in enumerate(c[:j1]):
                between[z + j2] = d(c1, c2)

        return lambda x, y: between[max(x, y) * k + min(x, y)]

    def _kmeans(self, k):
        # implements an improved version of kmeans, see [Hamerly2010].
        x = self.x
        d = self.d

        assert k <= len(x)

        # pick initial clusters. actually quite an important step.
        c = list(islice(self._pick_initial(), 0, k))
        assert len(c) == k

        q = [0] * len(c)  # cluster sizes
        cc = [[0] * len(x[0]) for _ in range(k)]  # per-cluster coordinate sums

        a = [0] * len(x)  # assignments
        u = [0] * len(x)  # upper bounds on distance to own centroid
        l = [0] * len(x)  # lower bounds on distance to second centroid

        for i, xi in enumerate(x):
            ai, u[i], _, l[i] = _smallest2(d(xi, cj) for cj in c)
            q[ai] += 1
            cc[ai] = _pairwise_sum(cc[ai], xi)
            a[i] = ai

        s = [0] * len(c)
        p = [0] * len(c)
        change = None

        while change is None or change > self.epsilon:
            # find cluster distances
            cd = self._distances(c)
            s = [min(cd(j1, j2) for j2 in range(k) if j2 != j1) for j1 in range(k)]

            # find new assignments
            for i, ai in enumerate(a):
                m = max(s[ai] / 2., l[i])

                if u[i] > m:
                    xi = x[i]
                    u[i] = d(xi, c[ai])

                    if u[i] > m:
                        new_ai, u[i], _, l[i] = _smallest2(d(xi, cj) for cj in c)

                        if new_ai != ai:
                            q[ai] -= 1
                            q[new_ai] += 1
                            cc[ai] = [a - b for a, b in zip(cc[ai], xi)]
                            cc[new_ai] = [a + b for a, b in zip(cc[new_ai], xi)]

                            a[i] = new_ai

            # move centers
            empty_cluster = False

            for j in range(len(c)):
                qj = q[j]
                if qj == 0:
                    empty_cluster = True
                    break
                c_old = c[j]
                c_new = [ccj / qj for ccj in cc[j]]
                distance = d(c_old, c_new)
                c[j] = c_new
                p[j] = distance

            if empty_cluster:
                break

            # update bounds
            r1, p1, r2, p2 = _largest2(p)
            for i, ai in enumerate(a):
                u[i] += p[ai]
                if r1 == ai:
                    l[i] -= p2
                else:
                    l[i] -= p1

            change = sum(p)

        # compute an approximate silhouette index
        within = [0] * len(c)
        for i, ai in enumerate(a):
            within[ai] += d(x[i], c[ai])
        for j in range(len(c)):
            # q[j] <= 1 also covers an empty cluster (q[j] == 0) after
            # the empty_cluster break above; previously that path could
            # divide by a negative count and raise InfiniteSilhouette.
            if q[j] <= 1:
                return a, -1.  # no good config
            within[j] /= q[j] - 1

        silhouette = fsum(_silhouette(a, b) for a, b in zip(within, s)) / len(c)
        return a, silhouette

    def _optimize(self, solutions):
        # keep consuming solutions while the silhouette improves.
        best_a = None
        best_silhouette = None
        best_k = None

        for a, silhouette, k in solutions():
            if best_silhouette is None:
                pass
            elif silhouette <= best_silhouette:
                break
            best_silhouette = silhouette
            best_a = a
            best_k = k

        return best_a, best_silhouette, best_k

    def with_k(self, k):
        def solutions():
            while True:
                a, s = self._kmeans(k)
                yield a, s, k

        return self._optimize(solutions)

    def without_k(self):
        def solutions():
            k = 2
            while k < len(self.x):
                yield self.with_k(k)
                k += 1

        return self._optimize(solutions)


def _squared_euclidean_distance(a, b):
    """Squared Euclidean distance of two coordinate vectors."""
    s = None
    for x, y in zip(a, b):
        d = x - y
        if s is None:
            s = d * d
        else:
            s += d * d
    return s


def kmeans(x, x_repr, k, mode, seed, epsilon):
    """Cluster the coordinate vectors x via k-means.

    x_repr holds the representation returned for each point; k is the
    number of clusters or None for automatic detection; mode is
    'clusters' or 'components'; seed fixes the random state; epsilon is
    the convergence threshold on total center movement.
    """
    assert len(x) == len(x_repr)

    random.seed(seed)
    km = _KMeans(x, _squared_euclidean_distance, epsilon)

    if k is None:
        a, _, k = km.without_k()
    else:
        assert 1 < k < len(x)
        a, _, k = km.with_k(k)

    if mode == 'clusters':
        return _clusters(x_repr, a, k)
    elif mode == 'components':
        return a
    else:
        raise ValueError('illegal mode %s' % mode)