1""" 2A module of classification schemes for choropleth mapping. 3""" 4import numpy as np 5import scipy.stats as stats 6import copy 7from sklearn.cluster import KMeans as KMEANS 8from warnings import warn as Warn 9 10__author__ = "Sergio J. Rey" 11 12__all__ = [ 13 "MapClassifier", 14 "quantile", 15 "BoxPlot", 16 "EqualInterval", 17 "FisherJenks", 18 "FisherJenksSampled", 19 "JenksCaspall", 20 "JenksCaspallForced", 21 "JenksCaspallSampled", 22 "HeadTailBreaks", 23 "MaxP", 24 "MaximumBreaks", 25 "NaturalBreaks", 26 "Quantiles", 27 "Percentiles", 28 "StdMean", 29 "UserDefined", 30 "gadf", 31 "KClassifiers", 32 "CLASSIFIERS", 33] 34 35CLASSIFIERS = ( 36 "BoxPlot", 37 "EqualInterval", 38 "FisherJenks", 39 "FisherJenksSampled", 40 "HeadTailBreaks", 41 "JenksCaspall", 42 "JenksCaspallForced", 43 "JenksCaspallSampled", 44 "MaxP", 45 "MaximumBreaks", 46 "NaturalBreaks", 47 "Quantiles", 48 "Percentiles", 49 "StdMean", 50 "UserDefined", 51) 52 53K = 5 # default number of classes in any map scheme with this as an argument 54SEEDRANGE = 1000000 # range for drawing random ints from for Natural Breaks 55 56 57FMT = "{:.2f}" 58 59try: 60 from numba import jit 61except ImportError: 62 63 def jit(func): 64 return func 65 66 67def _format_intervals(mc, fmt="{:.0f}"): 68 """ 69 Helper methods to format legend intervals 70 71 72 Parameters 73 ---------- 74 75 mc: MapClassifier 76 77 fmt: str 78 specification of formatting for legend 79 80 Returns 81 ------- 82 tuple: 83 edges: list 84 k strings for class intervals 85 max_width: int 86 length of largest interval string 87 lower_open: boolean 88 True: lower bound of first interval is open 89 False: lower bound of first interval is closed 90 91 Notes 92 ----- 93 For some classifiers, it is possible that the upper bound of the first interval is less than the minimum value of the attribute that is being classified. In these cases `lower_open=True` and the lower bound of the interval is set to `np.NINF`. 94 """ 95 96 97 lowest = mc.y.min() 98 if hasattr(mc, 'lowest'): 99 if mc.lowest is not None: 100 lowest = mc.lowest 101 lower_open = False 102 if lowest > mc.bins[0]: 103 lowest = np.NINF 104 lower_open = True 105 edges = [lowest] 106 edges.extend(mc.bins) 107 edges = [fmt.format(edge) for edge in edges] 108 max_width = max([len(edge) for edge in edges]) 109 return edges, max_width, lower_open 110 111 112def _get_mpl_labels(mc, fmt="{:.1f}"): 113 """ 114 Helper method to format legend intervals for matplotlib (and geopandas) 115 116 Parameters 117 ---------- 118 119 mc: MapClassifier 120 121 fmt: str 122 specification of formatting for legend 123 124 Returns 125 ------- 126 intervals: list 127 k strings for class intervals 128 """ 129 edges, max_width, lower_open = _format_intervals(mc, fmt) 130 k = len(edges) - 1 131 left = ["["] 132 if lower_open: 133 left = ["("] 134 left.extend("(" * k) 135 right = "]" * (k + 1) 136 lower = ["{:>{width}}".format(edges[i], width=max_width) for i in range(k)] 137 upper = ["{:>{width}}".format(edges[i], width=max_width) for i in range(1, k + 1)] 138 lower = [l + r for l, r in zip(left, lower)] 139 upper = [l + r for l, r in zip(upper, right)] 140 intervals = [l + ", " + r for l, r in zip(lower, upper)] 141 return intervals 142 143 144def _get_table(mc, fmt="{:.2f}"): 145 """ 146 Helper function to generate tabular classification report 147 148 Parameters 149 ---------- 150 151 mc: MapClassifier 152 153 fmt: str 154 specification of formatting for legend 155 156 Returns 157 ------- 158 table: string 159 formatted table of classification results 160 161 """ 162 intervals = _get_mpl_labels(mc, fmt) 163 interval_width = len(intervals[0]) 164 counts = list(map(str, mc.counts)) 165 count_width = max([len(count) for count in counts]) 166 count_width = max(count_width, len("count")) 167 interval_width = max(interval_width, len("interval")) 168 header = "{:^{width}}".format("Interval", width=interval_width) 169 header += " " + "{:>{width}}".format("Count", width=count_width) 170 title = "{:<{width}}".format(mc.name, width=len(header)) 171 header += "\n" + "-" * len(header) 172 table = [title, "", header] 173 for i, interval in enumerate(intervals): 174 row = interval + " | " + "{:>{width}}".format(counts[i], width=count_width) 175 table.append(row) 176 return "\n".join(table) 177 178 179def head_tail_breaks(values, cuts): 180 """ 181 head tail breaks helper function 182 """ 183 values = np.array(values) 184 mean = np.mean(values) 185 cuts.append(mean) 186 if len(set(values)) > 1: 187 return head_tail_breaks(values[values > mean], cuts) 188 return cuts 189 190 191def quantile(y, k=4): 192 """ 193 Calculates the quantiles for an array 194 195 Parameters 196 ---------- 197 y : array 198 (n,1), values to classify 199 k : int 200 number of quantiles 201 202 Returns 203 ------- 204 q : array 205 (n,1), quantile values 206 207 Examples 208 -------- 209 >>> import numpy as np 210 >>> import mapclassify as mc 211 >>> x = np.arange(1000) 212 >>> mc.classifiers.quantile(x) 213 array([249.75, 499.5 , 749.25, 999. ]) 214 >>> mc.classifiers.quantile(x, k = 3) 215 array([333., 666., 999.]) 216 217 Note that if there are enough ties that the quantile values repeat, we 218 collapse to pseudo quantiles in which case the number of classes will be 219 less than k 220 221 >>> x = [1.0] * 100 222 >>> x.extend([3.0] * 40) 223 >>> len(x) 224 140 225 >>> y = np.array(x) 226 >>> mc.classifiers.quantile(y) 227 array([1., 3.]) 228 """ 229 230 w = 100.0 / k 231 p = np.arange(w, 100 + w, w) 232 if p[-1] > 100.0: 233 p[-1] = 100.0 234 q = np.array([stats.scoreatpercentile(y, pct) for pct in p]) 235 q = np.unique(q) 236 k_q = len(q) 237 if k_q < k: 238 Warn( 239 "Warning: Not enough unique values in array to form k classes", UserWarning 240 ) 241 Warn("Warning: setting k to %d" % k_q, UserWarning) 242 return q 243 244 245def binC(y, bins): 246 """ 247 Bin categorical/qualitative data 248 249 Parameters 250 ---------- 251 y : array 252 (n,q), categorical values 253 bins : array 254 (k,1), unique values associated with each bin 255 256 Return 257 ------ 258 b : array 259 (n,q), bin membership, values between 0 and k-1 260 261 Examples 262 -------- 263 >>> import numpy as np 264 >>> import mapclassify as mc 265 >>> np.random.seed(1) 266 >>> x = np.random.randint(2, 8, (10, 3)) 267 >>> bins = list(range(2, 8)) 268 >>> x 269 array([[7, 5, 6], 270 [2, 3, 5], 271 [7, 2, 2], 272 [3, 6, 7], 273 [6, 3, 4], 274 [6, 7, 4], 275 [6, 5, 6], 276 [4, 6, 7], 277 [4, 6, 3], 278 [3, 2, 7]]) 279 >>> y = mc.classifiers.binC(x, bins) 280 >>> y 281 array([[5, 3, 4], 282 [0, 1, 3], 283 [5, 0, 0], 284 [1, 4, 5], 285 [4, 1, 2], 286 [4, 5, 2], 287 [4, 3, 4], 288 [2, 4, 5], 289 [2, 4, 1], 290 [1, 0, 5]]) 291 """ 292 293 if np.ndim(y) == 1: 294 k = 1 295 n = np.shape(y)[0] 296 else: 297 n, k = np.shape(y) 298 b = np.zeros((n, k), dtype="int") 299 for i, bin in enumerate(bins): 300 b[np.nonzero(y == bin)] = i 301 302 # check for non-binned items and warn if needed 303 vals = set(y.flatten()) 304 for val in vals: 305 if val not in bins: 306 Warn("value not in bin: {}".format(val), UserWarning) 307 Warn("bins: {}".format(bins), UserWarning) 308 309 return b 310 311 312def bin(y, bins): 313 """ 314 bin interval/ratio data 315 316 Parameters 317 ---------- 318 y : array 319 (n,q), values to bin 320 bins : array 321 (k,1), upper bounds of each bin (monotonic) 322 323 Returns 324 ------- 325 b : array 326 (n,q), values of values between 0 and k-1 327 328 Examples 329 -------- 330 >>> import numpy as np 331 >>> import mapclassify as mc 332 >>> np.random.seed(1) 333 >>> x = np.random.randint(2, 20, (10, 3)) 334 >>> bins = [10, 15, 20] 335 >>> b = mc.classifiers.bin(x, bins) 336 >>> x 337 array([[ 7, 13, 14], 338 [10, 11, 13], 339 [ 7, 17, 2], 340 [18, 3, 14], 341 [ 9, 15, 8], 342 [ 7, 13, 12], 343 [16, 6, 11], 344 [19, 2, 15], 345 [11, 11, 9], 346 [ 3, 2, 19]]) 347 >>> b 348 array([[0, 1, 1], 349 [0, 1, 1], 350 [0, 2, 0], 351 [2, 0, 1], 352 [0, 1, 0], 353 [0, 1, 1], 354 [2, 0, 1], 355 [2, 0, 1], 356 [1, 1, 0], 357 [0, 0, 2]]) 358 """ 359 if np.ndim(y) == 1: 360 k = 1 361 n = np.shape(y)[0] 362 else: 363 n, k = np.shape(y) 364 b = np.zeros((n, k), dtype="int") 365 i = len(bins) 366 if type(bins) != list: 367 bins = bins.tolist() 368 binsc = copy.copy(bins) 369 while binsc: 370 i -= 1 371 c = binsc.pop(-1) 372 b[np.nonzero(y <= c)] = i 373 return b 374 375 376def bin1d(x, bins): 377 """ 378 Place values of a 1-d array into bins and determine counts of values in 379 each bin 380 381 Parameters 382 ---------- 383 x : array 384 (n, 1), values to bin 385 bins : array 386 (k,1), upper bounds of each bin (monotonic) 387 388 Returns 389 ------- 390 binIds : array 391 1-d array of integer bin Ids 392 393 counts : int 394 number of elements of x falling in each bin 395 396 Examples 397 -------- 398 >>> import numpy as np 399 >>> import mapclassify as mc 400 >>> x = np.arange(100, dtype = 'float') 401 >>> bins = [25, 74, 100] 402 >>> binIds, counts = mc.classifiers.bin1d(x, bins) 403 >>> binIds 404 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 405 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 406 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 407 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 408 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) 409 >>> counts 410 array([26, 49, 25]) 411 """ 412 left = [-float("inf")] 413 left.extend(bins[0:-1]) 414 right = bins 415 cuts = list(zip(left, right)) 416 k = len(bins) 417 binIds = np.zeros(x.shape, dtype="int") 418 while cuts: 419 k -= 1 420 l, r = cuts.pop(-1) 421 binIds += (x > l) * (x <= r) * k 422 counts = np.bincount(binIds, minlength=len(bins)) 423 return (binIds, counts) 424 425 426def load_example(): 427 """ 428 Helper function for doc tests 429 """ 430 from .datasets import calemp 431 432 return calemp.load() 433 434 435def _kmeans(y, k=5, n_init=10): 436 """ 437 Helper function to do k-means in one dimension 438 439 Parameters 440 ---------- 441 442 y : array 443 (n,1), values to classify 444 k : int 445 number of classes to form 446 447 n_init : int, default: 10 448 number of initial solutions. Best of initial results is returned. 449 """ 450 451 y = y * 1.0 # KMEANS needs float or double dtype 452 y.shape = (-1, 1) 453 result = KMEANS(n_clusters=k, init="k-means++", n_init=n_init).fit(y) 454 class_ids = result.labels_ 455 centroids = result.cluster_centers_ 456 binning = [] 457 for c in range(k): 458 values = y[class_ids == c] 459 binning.append([values.max(), len(values)]) 460 binning = np.array(binning) 461 binning = binning[binning[:, 0].argsort()] 462 cuts = binning[:, 0] 463 464 y_cent = np.zeros_like(y) 465 for c in range(k): 466 y_cent[class_ids == c] = centroids[c] 467 diffs = y - y_cent 468 diffs *= diffs 469 470 return class_ids, cuts, diffs.sum(), centroids 471 472 473def natural_breaks(values, k=5, init=10): 474 """ 475 natural breaks helper function 476 477 Jenks natural breaks is kmeans in one dimension 478 479 Parameters 480 ---------- 481 482 values : array 483 (n, 1) values to bin 484 485 k : int 486 Number of classes 487 488 init: int, default:10 489 Number of different solutions to obtain using different centroids. Best solution is returned. 490 491 492 """ 493 values = np.array(values) 494 uv = np.unique(values) 495 uvk = len(uv) 496 if uvk < k: 497 Warn( 498 "Warning: Not enough unique values in array to form k classes", UserWarning 499 ) 500 Warn("Warning: setting k to %d" % uvk, UserWarning) 501 k = uvk 502 kres = _kmeans(values, k, n_init=init) 503 sids = kres[-1] # centroids 504 fit = kres[-2] 505 class_ids = kres[0] 506 cuts = kres[1] 507 return (sids, class_ids, fit, cuts) 508 509 510@jit 511def _fisher_jenks_means(values, classes=5, sort=True): 512 """ 513 Jenks Optimal (Natural Breaks) algorithm implemented in Python. 514 515 Notes 516 ----- 517 The original Python code comes from here: 518 http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/ 519 and is based on a JAVA and Fortran code available here: 520 https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html 521 522 Returns class breaks such that classes are internally homogeneous while 523 assuring heterogeneity among classes. 524 525 """ 526 if sort: 527 values.sort() 528 n_data = len(values) 529 mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32) 530 mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32) 531 mat1[1, 1:] = 1 532 mat2[2:, 1:] = np.inf 533 534 v = np.float32(0) 535 for l in range(2, len(values) + 1): 536 s1 = np.float32(0) 537 s2 = np.float32(0) 538 w = np.float32(0) 539 for m in range(1, l + 1): 540 i3 = l - m + 1 541 val = np.float32(values[i3 - 1]) 542 s2 += val * val 543 s1 += val 544 w += np.float32(1) 545 v = s2 - (s1 * s1) / w 546 i4 = i3 - 1 547 if i4 != 0: 548 for j in range(2, classes + 1): 549 if mat2[l, j] >= (v + mat2[i4, j - 1]): 550 mat1[l, j] = i3 551 mat2[l, j] = v + mat2[i4, j - 1] 552 mat1[l, 1] = 1 553 mat2[l, 1] = v 554 555 k = len(values) 556 557 kclass = np.zeros(classes + 1, dtype=values.dtype) 558 kclass[classes] = values[len(values) - 1] 559 kclass[0] = values[0] 560 for countNum in range(classes, 1, -1): 561 pivot = mat1[k, countNum] 562 id = int(pivot - 2) 563 kclass[countNum - 1] = values[id] 564 k = int(pivot - 1) 565 return kclass 566 567 568class MapClassifier(object): 569 r""" 570 Abstract class for all map classifications :cite:`Slocum_2009` 571 572 For an array :math:`y` of :math:`n` values, a map classifier places each 573 value :math:`y_i` into one of :math:`k` mutually exclusive and exhaustive 574 classes. Each classifer defines the classes based on different criteria, 575 but in all cases the following hold for the classifiers in PySAL: 576 577 .. math:: C_j^l < y_i \le C_j^u \ \forall i \in C_j 578 579 where :math:`C_j` denotes class :math:`j` which has lower bound 580 :math:`C_j^l` and upper bound :math:`C_j^u`. 581 582 Map Classifiers Supported 583 584 * :class:`mapclassify.classifiers.BoxPlot` 585 * :class:`mapclassify.classifiers.EqualInterval` 586 * :class:`mapclassify.classifiers.FisherJenks` 587 * :class:`mapclassify.classifiers.FisherJenksSampled` 588 * :class:`mapclassify.classifiers.HeadTailBreaks` 589 * :class:`mapclassify.classifiers.JenksCaspall` 590 * :class:`mapclassify.classifiers.JenksCaspallForced` 591 * :class:`mapclassify.classifiers.JenksCaspallSampled` 592 * :class:`mapclassify.classifiers.MaxP` 593 * :class:`mapclassify.classifiers.MaximumBreaks` 594 * :class:`mapclassify.classifiers.NaturalBreaks` 595 * :class:`mapclassify.classifiers.Quantiles` 596 * :class:`mapclassify.classifiers.Percentiles` 597 * :class:`mapclassify.classifiers.StdMean` 598 * :class:`mapclassify.classifiers.UserDefined` 599 600 Utilities: 601 602 In addition to the classifiers, there are several utility functions that 603 can be used to evaluate the properties of a specific classifier, 604 or for automatic selection of a classifier and 605 number of classes. 606 607 * :func:`mapclassify.classifiers.gadf` 608 * :class:`mapclassify.classifiers.K_classifiers` 609 610 """ 611 612 def __init__(self, y): 613 y = np.asarray(y).flatten() 614 self.name = "Map Classifier" 615 self.fmt = FMT 616 self.y = y 617 self._classify() 618 self._summary() 619 620 def get_fmt(self): 621 return self._fmt 622 623 def set_fmt(self, fmt): 624 self._fmt = fmt 625 626 fmt = property(get_fmt, set_fmt) 627 628 def _summary(self): 629 yb = self.yb 630 self.classes = [np.nonzero(yb == c)[0].tolist() for c in range(self.k)] 631 self.tss = self.get_tss() 632 self.adcm = self.get_adcm() 633 self.gadf = self.get_gadf() 634 635 def _classify(self): 636 self._set_bins() 637 self.yb, self.counts = bin1d(self.y, self.bins) 638 639 def _update(self, data, *args, **kwargs): 640 """ 641 The only thing that *should* happen in this function is 642 1. input sanitization for pandas 643 2. classification/reclassification. 644 645 Using their __init__ methods, all classifiers can re-classify given 646 different input parameters or additional data. 647 648 If you've got a cleverer updating equation than the intial estimation 649 equation, remove the call to self.__init__ below and replace it with 650 the updating function. 651 """ 652 if data is not None: 653 data = np.asarray(data).flatten() 654 data = np.append(data.flatten(), self.y) 655 else: 656 data = self.y 657 self.__init__(data, *args, **kwargs) 658 659 @classmethod 660 def make(cls, *args, **kwargs): 661 """ 662 Configure and create a classifier that will consume data and produce 663 classifications, given the configuration options specified by this 664 function. 665 666 Note that this like a *partial application* of the relevant class 667 constructor. `make` creates a function that returns classifications; it 668 does not actually do the classification. 669 670 If you want to classify data directly, use the appropriate class 671 constructor, like Quantiles, Max_Breaks, etc. 672 673 If you *have* a classifier object, but want to find which bins new data 674 falls into, use find_bin. 675 676 Parameters 677 ---------- 678 *args : required positional arguments 679 all positional arguments required by the classifier, 680 excluding the input data. 681 rolling : bool 682 a boolean configuring the outputted classifier to use 683 a rolling classifier rather than a new classifier for 684 each input. If rolling, this adds the current data to 685 all of the previous data in the classifier, and 686 rebalances the bins, like a running median 687 computation. 688 return_object : bool 689 a boolean configuring the outputted classifier to 690 return the classifier object or not 691 return_bins : bool 692 a boolean configuring the outputted classifier to 693 return the bins/breaks or not 694 return_counts : bool 695 a boolean configuring the outputted classifier to 696 return the histogram of objects falling into each bin 697 or not 698 699 Returns 700 ------- 701 A function that consumes data and returns their bins (and object, 702 bins/breaks, or counts, if requested). 703 704 Note 705 ---- 706 This is most useful when you want to run a classifier many times 707 with a given configuration, such as when classifying many columns of an 708 array or dataframe using the same configuration. 709 710 Examples 711 -------- 712 >>> import libpysal as ps 713 >>> import mapclassify as mc 714 >>> import geopandas as gpd 715 >>> df = gpd.read_file(ps.examples.get_path('columbus.dbf')) 716 >>> classifier = mc.Quantiles.make(k=9) 717 >>> cl = df[['HOVAL', 'CRIME', 'INC']].apply(classifier) 718 >>> cl["HOVAL"].values[:10] 719 array([8, 7, 2, 4, 1, 3, 8, 5, 7, 8]) 720 >>> cl["CRIME"].values[:10] 721 array([0, 1, 3, 4, 6, 2, 0, 5, 3, 4]) 722 >>> cl["INC"].values[:10] 723 array([7, 8, 5, 0, 3, 5, 0, 3, 6, 4]) 724 >>> import pandas as pd; from numpy import linspace as lsp 725 >>> data = [lsp(3,8,num=10), lsp(10, 0, num=10), lsp(-5, 15, num=10)] 726 >>> data = pd.DataFrame(data).T 727 >>> data 728 0 1 2 729 0 3.000000 10.000000 -5.000000 730 1 3.555556 8.888889 -2.777778 731 2 4.111111 7.777778 -0.555556 732 3 4.666667 6.666667 1.666667 733 4 5.222222 5.555556 3.888889 734 5 5.777778 4.444444 6.111111 735 6 6.333333 3.333333 8.333333 736 7 6.888889 2.222222 10.555556 737 8 7.444444 1.111111 12.777778 738 9 8.000000 0.000000 15.000000 739 >>> data.apply(mc.Quantiles.make(rolling=True)) 740 0 1 2 741 0 0 4 0 742 1 0 4 0 743 2 1 4 0 744 3 1 3 0 745 4 2 2 1 746 5 2 1 2 747 6 3 0 4 748 7 3 0 4 749 8 4 0 4 750 9 4 0 4 751 >>> dbf = ps.io.open(ps.examples.get_path('baltim.dbf')) 752 >>> data = dbf.by_col_array('PRICE', 'LOTSZ', 'SQFT') 753 >>> my_bins = [1, 10, 20, 40, 80] 754 >>> cl = [mc.UserDefined.make(bins=my_bins)(a) for a in data.T] 755 >>> len(cl) 756 3 757 >>> cl[0][:10] 758 array([4, 5, 5, 5, 4, 4, 5, 4, 4, 5]) 759 """ 760 761 # only flag overrides return flag 762 to_annotate = copy.deepcopy(kwargs) 763 return_object = kwargs.pop("return_object", False) 764 return_bins = kwargs.pop("return_bins", False) 765 return_counts = kwargs.pop("return_counts", False) 766 767 rolling = kwargs.pop("rolling", False) 768 if rolling: 769 # just initialize a fake classifier 770 data = list(range(10)) 771 cls_instance = cls(data, *args, **kwargs) 772 # and empty it, since we'll be using the update 773 cls_instance.y = np.array([]) 774 else: 775 cls_instance = None 776 777 # wrap init in a closure to make a consumer. 778 # Qc Na: "Objects/Closures are poor man's Closures/Objects" 779 def classifier(data, cls_instance=cls_instance): 780 if rolling: 781 cls_instance.update(data, inplace=True, **kwargs) 782 yb = cls_instance.find_bin(data) 783 else: 784 cls_instance = cls(data, *args, **kwargs) 785 yb = cls_instance.yb 786 outs = [yb, None, None, None] 787 outs[1] = cls_instance if return_object else None 788 outs[2] = cls_instance.bins if return_bins else None 789 outs[3] = cls_instance.counts if return_counts else None 790 outs = [a for a in outs if a is not None] 791 if len(outs) == 1: 792 return outs[0] 793 else: 794 return outs 795 796 # for debugging/jic, keep around the kwargs. 797 # in future, we might want to make this a thin class, so that we can 798 # set a custom repr. Call the class `Binner` or something, that's a 799 # pre-configured Classifier that just consumes data, bins it, & 800 # possibly updates the bins. 801 classifier._options = to_annotate 802 return classifier 803 804 def update(self, y=None, inplace=False, **kwargs): 805 """ 806 Add data or change classification parameters. 807 808 Parameters 809 ---------- 810 y : array 811 (n,1) array of data to classify 812 inplace : bool 813 whether to conduct the update in place or to return a copy 814 estimated from the additional specifications. 815 816 Additional parameters provided in **kwargs are passed to the init 817 function of the class. For documentation, check the class constructor. 818 """ 819 kwargs.update({"k": kwargs.pop("k", self.k)}) 820 if inplace: 821 self._update(y, **kwargs) 822 else: 823 new = copy.deepcopy(self) 824 new._update(y, **kwargs) 825 return new 826 827 def __str__(self): 828 return self.table() 829 830 def __repr__(self): 831 return self.table() 832 833 def table(self): 834 fmt = self.fmt 835 return _get_table(self, fmt=fmt) 836 837 def __call__(self, *args, **kwargs): 838 """ 839 This will allow the classifier to be called like it's a function. 840 841 Whether or not we want to make this be "find_bin" or "update" is a 842 design decision. 843 844 I like this as find_bin, since a classifier's job should be to classify 845 the data given to it using the rules estimated from the `_classify()` 846 function. 847 """ 848 return self.find_bin(*args) 849 850 def get_tss(self): 851 """ 852 Total sum of squares around class means 853 854 Returns sum of squares over all class means 855 """ 856 tss = 0 857 for class_def in self.classes: 858 if len(class_def) > 0: 859 yc = self.y[class_def] 860 css = yc - yc.mean() 861 css *= css 862 tss += sum(css) 863 return tss 864 865 def _set_bins(self): 866 pass 867 868 def get_adcm(self): 869 """ 870 Absolute deviation around class median (ADCM). 871 872 Calculates the absolute deviations of each observation about its class 873 median as a measure of fit for the classification method. 874 875 Returns sum of ADCM over all classes 876 """ 877 adcm = 0 878 for class_def in self.classes: 879 if len(class_def) > 0: 880 yc = self.y[class_def] 881 yc_med = np.median(yc) 882 ycd = np.abs(yc - yc_med) 883 adcm += sum(ycd) 884 return adcm 885 886 def get_gadf(self): 887 """ 888 Goodness of absolute deviation of fit 889 """ 890 adam = (np.abs(self.y - np.median(self.y))).sum() 891 gadf = 1 - self.adcm / adam 892 return gadf 893 894 def _table_string(self, width=12, decimal=3): 895 labels, largest = self.get_legend_classes(table=True) 896 h1 = "Lower" 897 h1 = h1.center(largest) 898 h2 = " " 899 h2 = h2.center(10) 900 h3 = "Upper" 901 h3 = h3.center(largest + 1) 902 903 largest = "%d" % max(self.counts) 904 largest = len(largest) + 15 905 h4 = "Count" 906 907 h4 = h4.rjust(largest) 908 table = [] 909 header = h1 + h2 + h3 + h4 910 table.append(header) 911 table.append("=" * len(header)) 912 913 for i, label in enumerate(labels): 914 left, right = label.split() 915 if i == 0: 916 left = " " * largest 917 left += " x[i] <= " 918 else: 919 left += " < x[i] <= " 920 row = left + right 921 cnt = "%d" % self.counts[i] 922 cnt = cnt.rjust(largest) 923 row += cnt 924 table.append(row) 925 name = self.name 926 top = name.center(len(row)) 927 table.insert(0, top) 928 table.insert(1, " ") 929 table = "\n".join(table) 930 return table 931 932 def find_bin(self, x): 933 """ 934 Sort input or inputs according to the current bin estimate 935 936 Parameters 937 ---------- 938 x : array or numeric 939 a value or array of values to fit within the estimated 940 bins 941 942 Returns 943 ------- 944 a bin index or array of bin indices that classify the input into one of 945 the classifiers' bins. 946 947 Note that this differs from similar functionality in 948 numpy.digitize(x, classi.bins, right=True). 949 950 This will always provide the closest bin, so data "outside" the classifier, 951 above and below the max/min breaks, will be classified into the nearest bin. 952 953 numpy.digitize returns k+1 for data greater than the greatest bin, but retains 0 954 for data below the lowest bin. 955 """ 956 x = np.asarray(x).flatten() 957 right = np.digitize(x, self.bins, right=True) 958 if right.max() == len(self.bins): 959 right[right == len(self.bins)] = len(self.bins) - 1 960 return right 961 962 def get_legend_classes(self, fmt=FMT): 963 """ 964 Format the strings for the classes on the legend 965 966 967 Parameters 968 ========== 969 970 fmt : string 971 formatting specification 972 973 Returns 974 ======= 975 classes: list 976 k strings with class interval definitions 977 """ 978 return _get_mpl_labels(self, fmt) 979 980 def plot( 981 self, 982 gdf, 983 border_color="lightgrey", 984 border_width=0.10, 985 title=None, 986 legend=False, 987 cmap="YlGnBu", 988 axis_on=True, 989 legend_kwds={"loc": "lower right", "fmt": FMT}, 990 file_name=None, 991 dpi=600, 992 ax=None, 993 ): 994 """ 995 Plot Mapclassiifer 996 NOTE: Requires matplotlib, and implicitly requires geopandas 997 dataframe as input. 998 999 Parameters 1000 --------- 1001 gdf : geopandas geodataframe 1002 Contains the geometry column for the choropleth map 1003 border_color : string, optional 1004 matplotlib color string to use for polygon border 1005 (Default: lightgrey) 1006 border_width : float, optional 1007 width of polygon boarder 1008 (Default: 0.10) 1009 title : string, optional 1010 Title of map 1011 (Default: None) 1012 cmap : string, optional 1013 matplotlib color string for color map to fill polygons 1014 (Default: YlGn) 1015 axis_on : boolean, optional 1016 Show coordinate axes (default True) 1017 (Default: True) 1018 legend_kwds : dict, optional 1019 options for ax.legend() 1020 (Default: {"loc": "lower right", 'fmt':FMT}) 1021 file_name : string, optional 1022 Name of file to save figure to. 1023 (Default: None) 1024 dpi : int, optional 1025 Dots per inch for saved figure 1026 (Default: 600) 1027 ax : matplotlib axis, optional 1028 axis on which to plot the choropleth. 1029 (Default: None, so plots on the current figure) 1030 Returns 1031 ------- 1032 f,ax : tuple 1033 matplotlib figure, axis on which the plot is made. 1034 1035 1036 Examples 1037 -------- 1038 1039 >>> import libpysal as lp 1040 >>> import geopandas 1041 >>> import mapclassify 1042 >>> gdf = geopandas.read_file(lp.examples.get_path("columbus.shp")) 1043 >>> q5 = mapclassify.Quantiles(gdf.CRIME) 1044 >>> q5.plot(gdf) # doctest: +SKIP 1045 """ 1046 try: 1047 import matplotlib.pyplot as plt 1048 except ImportError: 1049 raise ImportError( 1050 "Mapclassify.plot depends on matplotlib.pyplot, and this was" 1051 "not able to be imported. \nInstall matplotlib to" 1052 "plot spatial classifier." 1053 ) 1054 if ax is None: 1055 f = plt.figure() 1056 ax = plt.gca() 1057 else: 1058 f = plt.gcf() 1059 1060 ax = gdf.assign(_cl=self.y).plot( 1061 column="_cl", 1062 ax=ax, 1063 cmap=cmap, 1064 edgecolor=border_color, 1065 linewidth=border_width, 1066 scheme=self.name, 1067 legend=legend, 1068 legend_kwds=legend_kwds, 1069 ) 1070 if not axis_on: 1071 ax.axis("off") 1072 if title: 1073 f.suptitle(title) 1074 if file_name: 1075 plt.savefig(file_name, dpi=dpi) 1076 return f, ax 1077 1078 1079class HeadTailBreaks(MapClassifier): 1080 """ 1081 Head/tail Breaks Map Classification for Heavy-tailed Distributions 1082 1083 Parameters 1084 ---------- 1085 y : array 1086 (n,1), values to classify 1087 1088 Attributes 1089 ---------- 1090 yb : array 1091 (n,1), bin ids for observations, 1092 bins : array 1093 (k,1), the upper bounds of each class 1094 k : int 1095 the number of classes 1096 counts : array 1097 (k,1), the number of observations falling in each class 1098 1099 Examples 1100 -------- 1101 >>> import numpy as np 1102 >>> import mapclassify as mc 1103 >>> np.random.seed(10) 1104 >>> cal = mc.load_example() 1105 >>> htb = mc.HeadTailBreaks(cal) 1106 >>> htb.k 1107 3 1108 >>> htb.counts 1109 array([50, 7, 1]) 1110 >>> htb.bins 1111 array([ 125.92810345, 811.26 , 4111.45 ]) 1112 >>> np.random.seed(123456) 1113 >>> x = np.random.lognormal(3, 1, 1000) 1114 >>> htb = mc.HeadTailBreaks(x) 1115 >>> htb.bins 1116 array([ 32.26204423, 72.50205622, 128.07150107, 190.2899093 , 1117 264.82847377, 457.88157946, 576.76046949]) 1118 >>> htb.counts 1119 array([695, 209, 62, 22, 10, 1, 1]) 1120 1121 Notes 1122 ----- 1123 Head/tail Breaks is a relatively new classification method developed 1124 for data with a heavy-tailed distribution. 1125 1126 Implementation based on contributions by Alessandra Sozzi <alessandra.sozzi@gmail.com>. 1127 1128 For theoretical details see :cite:`Jiang_2013`. 1129 1130 """ 1131 1132 def __init__(self, y): 1133 MapClassifier.__init__(self, y) 1134 self.name = "HeadTailBreaks" 1135 1136 def _set_bins(self): 1137 1138 x = self.y.copy() 1139 bins = [] 1140 bins = head_tail_breaks(x, bins) 1141 self.bins = np.array(bins) 1142 self.k = len(self.bins) 1143 1144 1145class EqualInterval(MapClassifier): 1146 """ 1147 Equal Interval Classification 1148 1149 Parameters 1150 ---------- 1151 y : array 1152 (n,1), values to classify 1153 k : int 1154 number of classes required 1155 1156 Attributes 1157 ---------- 1158 yb : array 1159 (n,1), bin ids for observations, 1160 each value is the id of the class the observation belongs to 1161 yb[i] = j for j>=1 if bins[j-1] < y[i] <= bins[j], yb[i] = 0 1162 otherwise 1163 bins : array 1164 (k,1), the upper bounds of each class 1165 k : int 1166 the number of classes 1167 counts : array 1168 (k,1), the number of observations falling in each class 1169 1170 Examples 1171 -------- 1172 >>> import mapclassify as mc 1173 >>> cal = mc.load_example() 1174 >>> ei = mc.EqualInterval(cal, k = 5) 1175 >>> ei.k 1176 5 1177 >>> ei.counts 1178 array([57, 0, 0, 0, 1]) 1179 >>> ei.bins 1180 array([ 822.394, 1644.658, 2466.922, 3289.186, 4111.45 ]) 1181 1182 Notes 1183 ----- 1184 Intervals defined to have equal width: 1185 1186 .. math:: 1187 1188 bins_j = min(y)+w*(j+1) 1189 1190 with :math:`w=\\frac{max(y)-min(j)}{k}` 1191 """ 1192 1193 def __init__(self, y, k=K): 1194 """ 1195 see class docstring 1196 1197 """ 1198 if min(y) == max(y): 1199 raise ValueError("Not enough unique values in array to form k classes.") 1200 self.k = k 1201 MapClassifier.__init__(self, y) 1202 self.name = "EqualInterval" 1203 1204 def _set_bins(self): 1205 y = self.y 1206 k = self.k 1207 max_y = max(y) 1208 min_y = min(y) 1209 rg = max_y - min_y 1210 width = rg * 1.0 / k 1211 cuts = np.arange(min_y + width, max_y + width, width) 1212 if len(cuts) > self.k: # handle overshooting 1213 cuts = cuts[0:k] 1214 cuts[-1] = max_y 1215 bins = cuts.copy() 1216 self.bins = bins 1217 1218 1219class Percentiles(MapClassifier): 1220 """ 1221 Percentiles Map Classification 1222 1223 Parameters 1224 ---------- 1225 y : array 1226 attribute to classify 1227 pct : array 1228 percentiles default=[1,10,50,90,99,100] 1229 1230 Attributes 1231 ---------- 1232 yb : array 1233 bin ids for observations (numpy array n x 1) 1234 bins : array 1235 the upper bounds of each class (numpy array k x 1) 1236 k : int 1237 the number of classes 1238 counts : int 1239 the number of observations falling in each class 1240 (numpy array k x 1) 1241 1242 Examples 1243 -------- 1244 >>> import mapclassify as mc 1245 >>> cal = mc.load_example() 1246 >>> p = mc.Percentiles(cal) 1247 >>> p.bins 1248 array([1.357000e-01, 5.530000e-01, 9.365000e+00, 2.139140e+02, 1249 2.179948e+03, 4.111450e+03]) 1250 >>> p.counts 1251 array([ 1, 5, 23, 23, 5, 1]) 1252 >>> p2 = mc.Percentiles(cal, pct = [50, 100]) 1253 >>> p2.bins 1254 array([ 9.365, 4111.45 ]) 1255 >>> p2.counts 1256 array([29, 29]) 1257 >>> p2.k 1258 2 1259 """ 1260 1261 def __init__(self, y, pct=[1, 10, 50, 90, 99, 100]): 1262 self.pct = pct 1263 MapClassifier.__init__(self, y) 1264 self.name = "Percentiles" 1265 1266 def _set_bins(self): 1267 y = self.y 1268 pct = self.pct 1269 self.bins = np.array([stats.scoreatpercentile(y, p) for p in pct]) 1270 self.k = len(self.bins) 1271 1272 def update(self, y=None, inplace=False, **kwargs): 1273 """ 1274 Add data or change classification parameters. 1275 1276 Parameters 1277 ---------- 1278 y : array 1279 (n,1) array of data to classify 1280 inplace : bool 1281 whether to conduct the update in place or to return a copy 1282 estimated from the additional specifications. 1283 1284 Additional parameters provided in **kwargs are passed to the init 1285 function of the class. For documentation, check the class constructor. 1286 """ 1287 kwargs.update({"pct": kwargs.pop("pct", self.pct)}) 1288 if inplace: 1289 self._update(y, **kwargs) 1290 else: 1291 new = copy.deepcopy(self) 1292 new._update(y, **kwargs) 1293 return new 1294 1295 1296class BoxPlot(MapClassifier): 1297 """ 1298 BoxPlot Map Classification 1299 1300 Parameters 1301 ---------- 1302 y : array 1303 attribute to classify 1304 hinge : float 1305 multiplier for IQR 1306 1307 Attributes 1308 ---------- 1309 yb : array 1310 (n,1), bin ids for observations 1311 bins : array 1312 (n,1), the upper bounds of each class (monotonic) 1313 k : int 1314 the number of classes 1315 counts : array 1316 (k,1), the number of observations falling in each class 1317 low_outlier_ids : array 1318 indices of observations that are low outliers 1319 high_outlier_ids : array 1320 indices of observations that are high outliers 1321 1322 Notes 1323 ----- 1324 1325 The bins are set as follows:: 1326 1327 bins[0] = q[0]-hinge*IQR 1328 bins[1] = q[0] 1329 bins[2] = q[1] 1330 bins[3] = q[2] 1331 bins[4] = q[2]+hinge*IQR 1332 bins[5] = inf (see Notes) 1333 1334 where q is an array of the first three quartiles of y and 1335 IQR=q[2]-q[0] 1336 1337 If q[2]+hinge*IQR > max(y) there will only be 5 classes and no high 1338 outliers, otherwise, there will be 6 classes and at least one high 1339 outlier. 1340 1341 Examples 1342 -------- 1343 >>> import mapclassify as mc 1344 >>> cal = mc.load_example() 1345 >>> bp = mc.BoxPlot(cal) 1346 >>> bp.bins 1347 array([-5.287625e+01, 2.567500e+00, 9.365000e+00, 3.953000e+01, 1348 9.497375e+01, 4.111450e+03]) 1349 >>> bp.counts 1350 array([ 0, 15, 14, 14, 6, 9]) 1351 >>> bp.high_outlier_ids 1352 array([ 0, 6, 18, 29, 33, 36, 37, 40, 42]) 1353 >>> cal[bp.high_outlier_ids].values 1354 array([ 329.92, 181.27, 370.5 , 722.85, 192.05, 110.74, 4111.45, 1355 317.11, 264.93]) 1356 >>> bx = mc.BoxPlot(np.arange(100)) 1357 >>> bx.bins 1358 array([-49.5 , 24.75, 49.5 , 74.25, 148.5 ]) 1359 1360 """ 1361 1362 def __init__(self, y, hinge=1.5): 1363 """ 1364 Parameters 1365 ---------- 1366 y : array (n,1) 1367 attribute to classify 1368 hinge : float 1369 multiple of inter-quartile range (default=1.5) 1370 """ 1371 self.hinge = hinge 1372 MapClassifier.__init__(self, y) 1373 self.name = "BoxPlot" 1374 1375 def _set_bins(self): 1376 y = self.y 1377 pct = [25, 50, 75, 100] 1378 bins = [stats.scoreatpercentile(y, p) for p in pct] 1379 iqr = bins[-2] - bins[0] 1380 self.iqr = iqr 1381 pivot = self.hinge * iqr 1382 left_fence = bins[0] - pivot 1383 right_fence = bins[-2] + pivot 1384 if right_fence < bins[-1]: 1385 bins.insert(-1, right_fence) 1386 else: 1387 bins[-1] = right_fence 1388 bins.insert(0, left_fence) 1389 self.bins = np.array(bins) 1390 self.k = len(bins) 1391 1392 def _classify(self): 1393 MapClassifier._classify(self) 1394 self.low_outlier_ids = np.nonzero(self.yb == 0)[0] 1395 self.high_outlier_ids = np.nonzero(self.yb == 5)[0] 1396 1397 def update(self, y=None, inplace=False, **kwargs): 1398 """ 1399 Add data or change classification parameters. 1400 1401 Parameters 1402 ---------- 1403 y : array 1404 (n,1) array of data to classify 1405 inplace : bool 1406 whether to conduct the update in place or to return a 1407 copy estimated from the additional specifications. 1408 1409 Additional parameters provided in **kwargs are passed to the init 1410 function of the class. For documentation, check the class constructor. 1411 """ 1412 kwargs.update({"hinge": kwargs.pop("hinge", self.hinge)}) 1413 if inplace: 1414 self._update(y, **kwargs) 1415 else: 1416 new = copy.deepcopy(self) 1417 new._update(y, **kwargs) 1418 return new 1419 1420 1421class Quantiles(MapClassifier): 1422 """ 1423 Quantile Map Classification 1424 1425 Parameters 1426 ---------- 1427 y : array 1428 (n,1), values to classify 1429 k : int 1430 number of classes required 1431 1432 Attributes 1433 ---------- 1434 yb : array 1435 (n,1), bin ids for observations, 1436 each value is the id of the class the observation belongs to 1437 yb[i] = j for j>=1 if bins[j-1] < y[i] <= bins[j], yb[i] = 0 1438 otherwise 1439 bins : array 1440 (k,1), the upper bounds of each class 1441 k : int 1442 the number of classes 1443 counts : array 1444 (k,1), the number of observations falling in each class 1445 1446 1447 Examples 1448 -------- 1449 >>> import mapclassify as mc 1450 >>> cal = mc.load_example() 1451 >>> q = mc.Quantiles(cal, k = 5) 1452 >>> q.bins 1453 array([1.46400e+00, 5.79800e+00, 1.32780e+01, 5.46160e+01, 4.11145e+03]) 1454 >>> q.counts 1455 array([12, 11, 12, 11, 12]) 1456 """ 1457 1458 def __init__(self, y, k=K): 1459 self.k = k 1460 MapClassifier.__init__(self, y) 1461 self.name = "Quantiles" 1462 1463 def _set_bins(self): 1464 y = self.y 1465 k = self.k 1466 self.bins = quantile(y, k=k) 1467 1468 1469class StdMean(MapClassifier): 1470 """ 1471 Standard Deviation and Mean Map Classification 1472 1473 Parameters 1474 ---------- 1475 y : array 1476 (n,1), values to classify 1477 multiples : array 1478 the multiples of the standard deviation to add/subtract from 1479 the sample mean to define the bins, default=[-2,-1,1,2] 1480 1481 Attributes 1482 ---------- 1483 1484 yb : array 1485 (n,1), bin ids for observations, 1486 bins : array 1487 (k,1), the upper bounds of each class 1488 k : int 1489 the number of classes 1490 counts : array 1491 (k,1), the number of observations falling in each class 1492 1493 Examples 1494 -------- 1495 >>> import mapclassify as mc 1496 >>> cal = mc.load_example() 1497 >>> st = mc.StdMean(cal) 1498 >>> st.k 1499 5 1500 >>> st.bins 1501 array([-967.36235382, -420.71712519, 672.57333208, 1219.21856072, 1502 4111.45 ]) 1503 >>> st.counts 1504 array([ 0, 0, 56, 1, 1]) 1505 >>> 1506 >>> st3 = mc.StdMean(cal, multiples = [-3, -1.5, 1.5, 3]) 1507 >>> st3.bins 1508 array([-1514.00758246, -694.03973951, 945.8959464 , 1765.86378936, 1509 4111.45 ]) 1510 >>> st3.counts 1511 array([ 0, 0, 57, 0, 1]) 1512 1513 """ 1514 1515 def __init__(self, y, multiples=[-2, -1, 1, 2]): 1516 self.multiples = multiples 1517 MapClassifier.__init__(self, y) 1518 self.name = "StdMean" 1519 1520 def _set_bins(self): 1521 y = self.y 1522 s = y.std(ddof=1) 1523 m = y.mean() 1524 cuts = [m + s * w for w in self.multiples] 1525 y_max = y.max() 1526 if cuts[-1] < y_max: 1527 cuts.append(y_max) 1528 self.bins = np.array(cuts) 1529 self.k = len(cuts) 1530 1531 def update(self, y=None, inplace=False, **kwargs): 1532 """ 1533 Add data or change classification parameters. 1534 1535 Parameters 1536 ---------- 1537 y : array 1538 (n,1) array of data to classify 1539 inplace : bool 1540 whether to conduct the update in place or to return a copy 1541 estimated from the additional specifications. 1542 1543 Additional parameters provided in **kwargs are passed to the init 1544 function of the class. For documentation, check the class constructor. 1545 """ 1546 kwargs.update({"multiples": kwargs.pop("multiples", self.multiples)}) 1547 if inplace: 1548 self._update(y, **kwargs) 1549 else: 1550 new = copy.deepcopy(self) 1551 new._update(y, **kwargs) 1552 return new 1553 1554 1555class MaximumBreaks(MapClassifier): 1556 """ 1557 Maximum Breaks Map Classification 1558 1559 Parameters 1560 ---------- 1561 y : array 1562 (n, 1), values to classify 1563 1564 k : int 1565 number of classes required 1566 1567 mindiff : float 1568 The minimum difference between class breaks 1569 1570 Attributes 1571 ---------- 1572 yb : array 1573 (n, 1), bin ids for observations 1574 bins : array 1575 (k, 1), the upper bounds of each class 1576 k : int 1577 the number of classes 1578 counts : array 1579 (k, 1), the number of observations falling in each class (numpy 1580 array k x 1) 1581 1582 Examples 1583 -------- 1584 >>> import mapclassify as mc 1585 >>> cal = mc.load_example() 1586 >>> mb = mc.MaximumBreaks(cal, k = 5) 1587 >>> mb.k 1588 5 1589 >>> mb.bins 1590 array([ 146.005, 228.49 , 546.675, 2417.15 , 4111.45 ]) 1591 >>> mb.counts 1592 array([50, 2, 4, 1, 1]) 1593 1594 """ 1595 1596 def __init__(self, y, k=5, mindiff=0): 1597 if min(y) == max(y): 1598 raise ValueError("Not enough unique values in array to form k classes.") 1599 self.k = k 1600 self.mindiff = mindiff 1601 MapClassifier.__init__(self, y) 1602 self.name = "MaximumBreaks" 1603 1604 def _set_bins(self): 1605 xs = self.y.copy() 1606 k = self.k 1607 xs.sort() 1608 min_diff = self.mindiff 1609 diffs = xs[1:] - xs[:-1] 1610 idxs = np.argsort(diffs) 1611 k1 = k - 1 1612 1613 ud = np.unique(diffs) 1614 if len(ud) < k1: 1615 print("Insufficient number of unique diffs. Breaks are random.") 1616 mp = [] 1617 for c in range(1, k): 1618 idx = idxs[-c] 1619 cp = (xs[idx] + xs[idx + 1]) / 2.0 1620 mp.append(cp) 1621 mp.append(xs[-1]) 1622 mp.sort() 1623 self.bins = np.array(mp) 1624 1625 def update(self, y=None, inplace=False, **kwargs): 1626 """ 1627 Add data or change classification parameters. 1628 1629 Parameters 1630 ---------- 1631 y : array 1632 (n,1) array of data to classify 1633 inplace : bool 1634 whether to conduct the update in place or to return a copy 1635 estimated from the additional specifications. 1636 1637 Additional parameters provided in **kwargs are passed to the init 1638 function of the class. For documentation, check the class constructor. 1639 """ 1640 kwargs.update({"k": kwargs.pop("k", self.k)}) 1641 kwargs.update({"mindiff": kwargs.pop("mindiff", self.mindiff)}) 1642 if inplace: 1643 self._update(y, **kwargs) 1644 else: 1645 new = copy.deepcopy(self) 1646 new._update(y, **kwargs) 1647 return new 1648 1649 1650class NaturalBreaks(MapClassifier): 1651 """ 1652 Natural Breaks Map Classification 1653 1654 Parameters 1655 ---------- 1656 y : array 1657 (n,1), values to classify 1658 k : int 1659 number of classes required 1660 1661 initial : int, default: 10 1662 Number of initial solutions generated with different centroids. Best of initial results is returned. 1663 1664 Attributes 1665 ---------- 1666 1667 yb : array 1668 (n,1), bin ids for observations, 1669 bins : array 1670 (k,1), the upper bounds of each class 1671 k : int 1672 the number of classes 1673 counts : array 1674 (k,1), the number of observations falling in each class 1675 1676 Examples 1677 -------- 1678 >>> import numpy as np 1679 >>> import mapclassify as mc 1680 >>> np.random.seed(123456) 1681 >>> cal = mc.load_example() 1682 >>> nb = mc.NaturalBreaks(cal, k=5) 1683 >>> nb.k 1684 5 1685 >>> nb.counts 1686 array([49, 3, 4, 1, 1]) 1687 >>> nb.bins 1688 array([ 75.29, 192.05, 370.5 , 722.85, 4111.45]) 1689 >>> x = np.array([1] * 50) 1690 >>> x[-1] = 20 1691 >>> nb = mc.NaturalBreaks(x, k = 5) 1692 1693 Warning: Not enough unique values in array to form k classes 1694 Warning: setting k to 2 1695 1696 >>> nb.bins 1697 array([ 1, 20]) 1698 >>> nb.counts 1699 array([49, 1]) 1700 1701 """ 1702 1703 def __init__(self, y, k=K, initial=10): 1704 self.k = k 1705 self.init = initial 1706 MapClassifier.__init__(self, y) 1707 self.name = "NaturalBreaks" 1708 1709 def _set_bins(self): 1710 1711 x = self.y.copy() 1712 k = self.k 1713 values = np.array(x) 1714 uv = np.unique(values) 1715 uvk = len(uv) 1716 if uvk < k: 1717 ms = "Warning: Not enough unique values in array to form k classes" 1718 Warn(ms, UserWarning) 1719 Warn("Warning: setting k to %d" % uvk, UserWarning) 1720 k = uvk 1721 uv.sort() 1722 # we set the bins equal to the sorted unique values and ramp k 1723 # downwards. no need to call kmeans. 1724 self.bins = uv 1725 self.k = k 1726 else: 1727 res0 = natural_breaks(x, k, init=self.init) 1728 fit = res0[2] 1729 self.bins = np.array(res0[-1]) 1730 self.k = len(self.bins) 1731 1732 def update(self, y=None, inplace=False, **kwargs): 1733 """ 1734 Add data or change classification parameters. 1735 1736 Parameters 1737 ---------- 1738 y : array 1739 (n,1) array of data to classify 1740 inplace : bool 1741 whether to conduct the update in place or to return a 1742 copy estimated from the additional specifications. 1743 1744 Additional parameters provided in **kwargs are passed to the init 1745 function of the class. For documentation, check the class constructor. 1746 """ 1747 kwargs.update({"k": kwargs.pop("k", self.k)}) 1748 if inplace: 1749 self._update(y, **kwargs) 1750 else: 1751 new = copy.deepcopy(self) 1752 new._update(y, **kwargs) 1753 return new 1754 1755 1756class FisherJenks(MapClassifier): 1757 """ 1758 Fisher Jenks optimal classifier - mean based 1759 1760 Parameters 1761 ---------- 1762 y : array 1763 (n,1), values to classify 1764 k : int 1765 number of classes required 1766 1767 Attributes 1768 ---------- 1769 yb : array 1770 (n,1), bin ids for observations 1771 bins : array 1772 (k,1), the upper bounds of each class 1773 k : int 1774 the number of classes 1775 counts : array 1776 (k,1), the number of observations falling in each class 1777 1778 Examples 1779 -------- 1780 >>> import mapclassify as mc 1781 >>> cal = mc.load_example() 1782 >>> fj = mc.FisherJenks(cal) 1783 >>> fj.adcm 1784 799.24 1785 >>> fj.bins 1786 array([ 75.29, 192.05, 370.5 , 722.85, 4111.45]) 1787 >>> fj.counts 1788 array([49, 3, 4, 1, 1]) 1789 >>> 1790 """ 1791 1792 def __init__(self, y, k=K): 1793 1794 nu = len(np.unique(y)) 1795 if nu < k: 1796 raise ValueError("Fewer unique values than specified classes.") 1797 self.k = k 1798 MapClassifier.__init__(self, y) 1799 self.name = "FisherJenks" 1800 1801 def _set_bins(self): 1802 x = self.y.copy() 1803 self.bins = np.array(_fisher_jenks_means(x, classes=self.k)[1:]) 1804 1805 1806class FisherJenksSampled(MapClassifier): 1807 """ 1808 Fisher Jenks optimal classifier - mean based using random sample 1809 1810 Parameters 1811 ---------- 1812 y : array 1813 (n,1), values to classify 1814 k : int 1815 number of classes required 1816 pct : float 1817 The percentage of n that should form the sample 1818 If pct is specified such that n*pct > 1000, then 1819 pct = 1000./n, unless truncate is False 1820 truncate : boolean 1821 truncate pct in cases where pct * n > 1000., (Default True) 1822 1823 Attributes 1824 ---------- 1825 yb : array 1826 (n,1), bin ids for observations 1827 bins : array 1828 (k,1), the upper bounds of each class 1829 k : int 1830 the number of classes 1831 counts : array 1832 (k,1), the number of observations falling in each class 1833 1834 Notes 1835 ----- 1836 For theoretical details see :cite:`Rey_2016`. 1837 1838 """ 1839 1840 def __init__(self, y, k=K, pct=0.10, truncate=True): 1841 self.k = k 1842 n = y.size 1843 1844 if (pct * n > 1000) and truncate: 1845 pct = 1000.0 / n 1846 ids = np.random.randint(0, n, int(n * pct)) 1847 y = np.asarray(y) 1848 yr = y[ids] 1849 yr[-1] = max(y) # make sure we have the upper bound 1850 yr[0] = min(y) # make sure we have the min 1851 self.original_y = y 1852 self.pct = pct 1853 self._truncated = truncate 1854 self.yr = yr 1855 self.yr_n = yr.size 1856 MapClassifier.__init__(self, yr) 1857 self.yb, self.counts = bin1d(y, self.bins) 1858 self.name = "FisherJenksSampled" 1859 self.y = y 1860 self._summary() # have to recalculate summary stats 1861 1862 def _set_bins(self): 1863 fj = FisherJenks(self.y, self.k) 1864 self.bins = fj.bins 1865 1866 def update(self, y=None, inplace=False, **kwargs): 1867 """ 1868 Add data or change classification parameters. 1869 1870 Parameters 1871 ---------- 1872 y : array 1873 (n,1) array of data to classify 1874 inplace : bool 1875 whether to conduct the update in place or to return a 1876 copy estimated from the additional specifications. 1877 1878 Additional parameters provided in **kwargs are passed to the init 1879 function of the class. For documentation, check the class constructor. 1880 """ 1881 kwargs.update({"k": kwargs.pop("k", self.k)}) 1882 kwargs.update({"pct": kwargs.pop("pct", self.pct)}) 1883 kwargs.update({"truncate": kwargs.pop("truncate", self._truncated)}) 1884 if inplace: 1885 self._update(y, **kwargs) 1886 else: 1887 new = copy.deepcopy(self) 1888 new._update(y, **kwargs) 1889 return new 1890 1891 1892class JenksCaspall(MapClassifier): 1893 """ 1894 Jenks Caspall Map Classification 1895 1896 Parameters 1897 ---------- 1898 y : array 1899 (n,1), values to classify 1900 k : int 1901 number of classes required 1902 1903 Attributes 1904 ---------- 1905 1906 yb : array 1907 (n,1), bin ids for observations, 1908 bins : array 1909 (k,1), the upper bounds of each class 1910 k : int 1911 the number of classes 1912 counts : array 1913 (k,1), the number of observations falling in each class 1914 1915 1916 Examples 1917 -------- 1918 >>> import mapclassify as mc 1919 >>> cal = mc.load_example() 1920 >>> jc = mc.JenksCaspall(cal, k = 5) 1921 >>> jc.bins 1922 array([1.81000e+00, 7.60000e+00, 2.98200e+01, 1.81270e+02, 4.11145e+03]) 1923 >>> jc.counts 1924 array([14, 13, 14, 10, 7]) 1925 1926 """ 1927 1928 def __init__(self, y, k=K): 1929 self.k = k 1930 MapClassifier.__init__(self, y) 1931 self.name = "JenksCaspall" 1932 1933 def _set_bins(self): 1934 x = self.y.copy() 1935 k = self.k 1936 # start with quantiles 1937 q = quantile(x, k) 1938 solving = True 1939 xb, cnts = bin1d(x, q) 1940 # class means 1941 if x.ndim == 1: 1942 x.shape = (x.size, 1) 1943 n, k = x.shape 1944 xm = [np.median(x[xb == i]) for i in np.unique(xb)] 1945 xb0 = xb.copy() 1946 q = xm 1947 it = 0 1948 rk = list(range(self.k)) 1949 while solving: 1950 xb = np.zeros(xb0.shape, int) 1951 d = abs(x - q) 1952 xb = d.argmin(axis=1) 1953 if (xb0 == xb).all(): 1954 solving = False 1955 else: 1956 xb0 = xb 1957 it += 1 1958 q = np.array([np.median(x[xb == i]) for i in rk]) 1959 cuts = np.array([max(x[xb == i]) for i in np.unique(xb)]) 1960 cuts.shape = (len(cuts),) 1961 self.bins = cuts 1962 self.iterations = it 1963 1964 1965class JenksCaspallSampled(MapClassifier): 1966 """ 1967 Jenks Caspall Map Classification using a random sample 1968 1969 Parameters 1970 ---------- 1971 1972 y : array 1973 (n,1), values to classify 1974 k : int 1975 number of classes required 1976 pct : float 1977 The percentage of n that should form the sample 1978 If pct is specified such that n*pct > 1000, then pct = 1000./n 1979 1980 Attributes 1981 ---------- 1982 1983 yb : array 1984 (n,1), bin ids for observations, 1985 bins : array 1986 (k,1), the upper bounds of each class 1987 k : int 1988 the number of classes 1989 counts : array 1990 (k,1), the number of observations falling in each class 1991 1992 1993 Examples 1994 -------- 1995 >>> import mapclassify as mc 1996 >>> cal = mc.load_example() 1997 >>> x = np.random.random(100000) 1998 >>> jc = mc.JenksCaspall(x) 1999 >>> jcs = mc.JenksCaspallSampled(x) 2000 >>> jc.bins 2001 array([0.1988721 , 0.39624334, 0.59441487, 0.79624357, 0.99999251]) 2002 >>> jcs.bins 2003 array([0.20998558, 0.42112792, 0.62752937, 0.80543819, 0.99999251]) 2004 >>> jc.counts 2005 array([19943, 19510, 19547, 20297, 20703]) 2006 >>> jcs.counts 2007 array([21039, 20908, 20425, 17813, 19815]) 2008 2009 # not for testing since we get different times on different hardware 2010 # just included for documentation of likely speed gains 2011 #>>> t1 = time.time(); jc = Jenks_Caspall(x); t2 = time.time() 2012 #>>> t1s = time.time(); jcs = Jenks_Caspall_Sampled(x); t2s = time.time() 2013 #>>> t2 - t1; t2s - t1s 2014 #1.8292930126190186 2015 #0.061631917953491211 2016 2017 Notes 2018 ----- 2019 This is intended for large n problems. The logic is to apply 2020 Jenks_Caspall to a random subset of the y space and then bin the 2021 complete vector y on the bins obtained from the subset. This would 2022 trade off some "accuracy" for a gain in speed. 2023 2024 """ 2025 2026 def __init__(self, y, k=K, pct=0.10): 2027 self.k = k 2028 n = y.size 2029 if pct * n > 1000: 2030 pct = 1000.0 / n 2031 ids = np.random.randint(0, n, int(n * pct)) 2032 y = np.asarray(y) 2033 yr = y[ids] 2034 yr[0] = max(y) # make sure we have the upper bound 2035 self.original_y = y 2036 self.pct = pct 2037 self.yr = yr 2038 self.yr_n = yr.size 2039 MapClassifier.__init__(self, yr) 2040 self.yb, self.counts = bin1d(y, self.bins) 2041 self.name = "JenksCaspallSampled" 2042 self.y = y 2043 self._summary() # have to recalculate summary stats 2044 2045 def _set_bins(self): 2046 jc = JenksCaspall(self.y, self.k) 2047 self.bins = jc.bins 2048 self.iterations = jc.iterations 2049 2050 def update(self, y=None, inplace=False, **kwargs): 2051 """ 2052 Add data or change classification parameters. 2053 2054 Parameters 2055 ---------- 2056 y : array 2057 (n,1) array of data to classify 2058 inplace : bool 2059 whether to conduct the update in place or to return a 2060 copy estimated from the additional specifications. 2061 2062 Additional parameters provided in **kwargs are passed to the init 2063 function of the class. For documentation, check the class constructor. 2064 """ 2065 kwargs.update({"k": kwargs.pop("k", self.k)}) 2066 kwargs.update({"pct": kwargs.pop("pct", self.pct)}) 2067 if inplace: 2068 self._update(y, **kwargs) 2069 else: 2070 new = copy.deepcopy(self) 2071 new._update(y, **kwargs) 2072 return new 2073 2074 2075class JenksCaspallForced(MapClassifier): 2076 """ 2077 Jenks Caspall Map Classification with forced movements 2078 2079 Parameters 2080 ---------- 2081 y : array 2082 (n,1), values to classify 2083 k : int 2084 number of classes required 2085 2086 Attributes 2087 ---------- 2088 yb : array 2089 (n,1), bin ids for observations 2090 bins : array 2091 (k,1), the upper bounds of each class 2092 k : int 2093 the number of classes 2094 counts : array 2095 (k,1), the number of observations falling in each class 2096 2097 2098 Examples 2099 -------- 2100 >>> import mapclassify as mc 2101 >>> cal = mc.load_example() 2102 >>> jcf = mc.JenksCaspallForced(cal, k = 5) 2103 >>> jcf.k 2104 5 2105 >>> jcf.bins 2106 array([1.34000e+00, 5.90000e+00, 1.67000e+01, 5.06500e+01, 4.11145e+03]) 2107 >>> jcf.counts 2108 array([12, 12, 13, 9, 12]) 2109 >>> jcf4 = mc.JenksCaspallForced(cal, k = 4) 2110 >>> jcf4.k 2111 4 2112 >>> jcf4.bins 2113 array([2.51000e+00, 8.70000e+00, 3.66800e+01, 4.11145e+03]) 2114 >>> jcf4.counts 2115 array([15, 14, 14, 15]) 2116 """ 2117 2118 def __init__(self, y, k=K): 2119 if min(y) == max(y): 2120 raise ValueError("Not enough unique values in array to form k classes.") 2121 self.k = k 2122 MapClassifier.__init__(self, y) 2123 self.name = "JenksCaspallForced" 2124 2125 def _set_bins(self): 2126 x = self.y.copy() 2127 k = self.k 2128 q = quantile(x, k) 2129 solving = True 2130 xb, cnt = bin1d(x, q) 2131 # class means 2132 if x.ndim == 1: 2133 x.shape = (x.size, 1) 2134 n, tmp = x.shape 2135 xm = [x[xb == i].mean() for i in np.unique(xb)] 2136 q = xm 2137 xbar = np.array([xm[xbi] for xbi in xb]) 2138 xbar.shape = (n, 1) 2139 ss = x - xbar 2140 ss *= ss 2141 ss = sum(ss) 2142 down_moves = up_moves = 0 2143 solving = True 2144 it = 0 2145 while solving: 2146 # try upward moves first 2147 moving_up = True 2148 while moving_up: 2149 class_ids = np.unique(xb) 2150 nk = [sum(xb == j) for j in class_ids] 2151 candidates = nk[:-1] 2152 i = 0 2153 up_moves = 0 2154 while candidates: 2155 nki = candidates.pop(0) 2156 if nki > 1: 2157 ids = np.nonzero(xb == class_ids[i]) 2158 mover = max(ids[0]) 2159 tmp = xb.copy() 2160 tmp[mover] = xb[mover] + 1 2161 tm = [x[tmp == j].mean() for j in np.unique(tmp)] 2162 txbar = np.array([tm[xbi] for xbi in tmp]) 2163 txbar.shape = (n, 1) 2164 tss = x - txbar 2165 tss *= tss 2166 tss = sum(tss) 2167 if tss < ss: 2168 xb = tmp 2169 ss = tss 2170 candidates = [] 2171 up_moves += 1 2172 i += 1 2173 if not up_moves: 2174 moving_up = False 2175 moving_down = True 2176 while moving_down: 2177 class_ids = np.unique(xb) 2178 nk = [sum(xb == j) for j in class_ids] 2179 candidates = nk[1:] 2180 i = 1 2181 down_moves = 0 2182 while candidates: 2183 nki = candidates.pop(0) 2184 if nki > 1: 2185 ids = np.nonzero(xb == class_ids[i]) 2186 mover = min(ids[0]) 2187 mover_class = xb[mover] 2188 target_class = mover_class - 1 2189 tmp = xb.copy() 2190 tmp[mover] = target_class 2191 tm = [x[tmp == j].mean() for j in np.unique(tmp)] 2192 txbar = np.array([tm[xbi] for xbi in tmp]) 2193 txbar.shape = (n, 1) 2194 tss = x - txbar 2195 tss *= tss 2196 tss = sum(tss) 2197 if tss < ss: 2198 xb = tmp 2199 ss = tss 2200 candidates = [] 2201 down_moves += 1 2202 i += 1 2203 if not down_moves: 2204 moving_down = False 2205 if not up_moves and not down_moves: 2206 solving = False 2207 it += 1 2208 cuts = [max(x[xb == c]) for c in np.unique(xb)] 2209 cuts = np.reshape(np.array(cuts), (k,)) 2210 self.bins = cuts 2211 self.iterations = it 2212 2213 2214class UserDefined(MapClassifier): 2215 """ 2216 User Specified Binning 2217 2218 Parameters 2219 ---------- 2220 y : array 2221 (n,1), values to classify 2222 bins : array 2223 (k,1), upper bounds of classes (have to be monotically increasing) 2224 2225 lowest : float (optional) 2226 scalar minimum value of lowest class. Default is to set the minimum 2227 to -inf if y.min() > first upper bound, otherwise minimum is set to 2228 y.min(). lowest will override the default 2229 2230 2231 2232 Attributes 2233 ---------- 2234 yb : array 2235 (n,1), bin ids for observations, 2236 bins : array 2237 (k,1), the upper bounds of each class 2238 k : int 2239 the number of classes 2240 counts : array 2241 (k,1), the number of observations falling in each class 2242 2243 2244 Examples 2245 -------- 2246 >>> import mapclassify as mc 2247 >>> cal = mc.load_example() 2248 >>> bins = [20, max(cal)] 2249 >>> bins 2250 [20, 4111.45] 2251 >>> ud = mc.UserDefined(cal, bins) 2252 >>> ud.bins 2253 array([ 20. , 4111.45]) 2254 >>> ud.counts 2255 array([37, 21]) 2256 >>> bins = [20, 30] 2257 >>> ud = mc.UserDefined(cal, bins) 2258 >>> ud.bins 2259 array([ 20. , 30. , 4111.45]) 2260 >>> ud.counts 2261 array([37, 4, 17]) 2262 2263 Notes 2264 ----- 2265 If upper bound of user bins does not exceed max(y) we append an 2266 additional bin. 2267 2268 """ 2269 2270 def __init__(self, y, bins, lowest=None): 2271 if bins[-1] < max(y): 2272 bins = np.append(bins, max(y)) 2273 self.lowest = lowest 2274 self.k = len(bins) 2275 self.bins = np.array(bins) 2276 self.y = y 2277 MapClassifier.__init__(self, y) 2278 self.name = "UserDefined" 2279 2280 def _set_bins(self): 2281 pass 2282 2283 def _update(self, y=None, bins=None): 2284 if y is not None: 2285 if hasattr(y, "values"): 2286 y = y.values 2287 y = np.append(y.flatten(), self.y) 2288 else: 2289 y = self.y 2290 if bins is None: 2291 bins = self.bins 2292 self.__init__(y, bins) 2293 2294 def update(self, y=None, inplace=False, **kwargs): 2295 """ 2296 Add data or change classification parameters. 2297 2298 Parameters 2299 ---------- 2300 y : array 2301 (n,1) array of data to classify 2302 inplace : bool 2303 whether to conduct the update in place or to return a 2304 copy estimated from the additional specifications. 2305 2306 Additional parameters provided in **kwargs are passed to the init 2307 function of the class. For documentation, check the class constructor. 2308 """ 2309 bins = kwargs.pop("bins", self.bins) 2310 if inplace: 2311 self._update(y=y, bins=bins, **kwargs) 2312 else: 2313 new = copy.deepcopy(self) 2314 new._update(y, bins, **kwargs) 2315 return new 2316 2317 # We have to override the plot method for additional kwargs for UserDefined 2318 def plot( 2319 self, 2320 gdf, 2321 border_color="lightgrey", 2322 border_width=0.10, 2323 title=None, 2324 legend=False, 2325 cmap="YlGnBu", 2326 axis_on=True, 2327 legend_kwds={"loc": "lower right", "fmt": FMT}, 2328 file_name=None, 2329 dpi=600, 2330 ax=None, 2331 ): 2332 try: 2333 import matplotlib.pyplot as plt 2334 except ImportError: 2335 raise ImportError( 2336 "Mapclassify.plot depends on matplotlib.pyplot, and this was" 2337 "not able to be imported. \nInstall matplotlib to" 2338 "plot spatial classifier." 2339 ) 2340 if ax is None: 2341 f = plt.figure() 2342 ax = plt.gca() 2343 else: 2344 f = plt.gcf() 2345 2346 fmt = FMT 2347 if "fmt" in legend_kwds: 2348 fmt = legend_kwds.pop("fmt") 2349 2350 ax = gdf.assign(_cl=self.y).plot( 2351 column="_cl", 2352 ax=ax, 2353 cmap=cmap, 2354 edgecolor=border_color, 2355 linewidth=border_width, 2356 scheme=self.name, 2357 legend=legend, 2358 legend_kwds=legend_kwds, 2359 classification_kwds={"bins": self.bins}, # for UserDefined 2360 ) 2361 if not axis_on: 2362 ax.axis("off") 2363 if title: 2364 f.suptitle(title) 2365 if file_name: 2366 plt.savefig(file_name, dpi=dpi) 2367 return f, ax 2368 2369 2370class MaxP(MapClassifier): 2371 """ 2372 MaxP Map Classification 2373 2374 Based on Max-p regionalization algorithm 2375 2376 Parameters 2377 ---------- 2378 y : array 2379 (n,1), values to classify 2380 k : int 2381 number of classes required 2382 initial : int 2383 number of initial solutions to use prior to swapping 2384 2385 Attributes 2386 ---------- 2387 2388 yb : array 2389 (n,1), bin ids for observations, 2390 bins : array 2391 (k,1), the upper bounds of each class 2392 k : int 2393 the number of classes 2394 counts : array 2395 (k,1), the number of observations falling in each class 2396 2397 Examples 2398 -------- 2399 >>> import mapclassify as mc 2400 >>> cal = mc.load_example() 2401 >>> mp = mc.MaxP(cal) 2402 >>> mp.bins 2403 array([ 8.7 , 16.7 , 20.47, 66.26, 4111.45]) 2404 2405 >>> mp.counts 2406 array([29, 8, 1, 10, 10]) 2407 """ 2408 2409 def __init__(self, y, k=K, initial=1000): 2410 if min(y) == max(y): 2411 raise ValueError("Not enough unique values in array to form k classes.") 2412 self.k = k 2413 self.initial = initial 2414 MapClassifier.__init__(self, y) 2415 self.name = "MaxP" 2416 2417 def _set_bins(self): 2418 x = self.y.copy() 2419 k = self.k 2420 q = quantile(x, k) 2421 if x.ndim == 1: 2422 x.shape = (x.size, 1) 2423 n, tmp = x.shape 2424 x.sort(axis=0) 2425 # find best of initial solutions 2426 solution = 0 2427 best_tss = x.var() * x.shape[0] 2428 tss_all = np.zeros((self.initial, 1)) 2429 while solution < self.initial: 2430 remaining = list(range(n)) 2431 seeds = [ 2432 np.nonzero(di == min(di))[0][0] for di in [np.abs(x - qi) for qi in q] 2433 ] 2434 rseeds = np.random.permutation(list(range(k))).tolist() 2435 [remaining.remove(seed) for seed in seeds] 2436 self.classes = classes = [] 2437 [classes.append([seed]) for seed in seeds] 2438 while rseeds: 2439 seed_id = rseeds.pop() 2440 current = classes[seed_id] 2441 growing = True 2442 while growing: 2443 current = classes[seed_id] 2444 low = current[0] 2445 high = current[-1] 2446 left = low - 1 2447 right = high + 1 2448 move_made = False 2449 if left in remaining: 2450 current.insert(0, left) 2451 remaining.remove(left) 2452 move_made = True 2453 if right in remaining: 2454 current.append(right) 2455 remaining.remove(right) 2456 move_made = True 2457 if move_made: 2458 classes[seed_id] = current 2459 else: 2460 growing = False 2461 tss = _fit(self.y, classes) 2462 tss_all[solution] = tss 2463 if tss < best_tss: 2464 best_solution = classes 2465 best_it = solution 2466 best_tss = tss 2467 solution += 1 2468 classes = best_solution 2469 self.best_it = best_it 2470 self.tss = best_tss 2471 self.a2c = a2c = {} 2472 self.tss_all = tss_all 2473 for r, cl in enumerate(classes): 2474 for a in cl: 2475 a2c[a] = r 2476 swapping = True 2477 while swapping: 2478 rseeds = np.random.permutation(list(range(k))).tolist() 2479 total_moves = 0 2480 while rseeds: 2481 id = rseeds.pop() 2482 growing = True 2483 total_moves = 0 2484 while growing: 2485 target = classes[id] 2486 left = target[0] - 1 2487 right = target[-1] + 1 2488 n_moves = 0 2489 if left in a2c: 2490 left_class = classes[a2c[left]] 2491 if len(left_class) > 1: 2492 a = left_class[-1] 2493 if self._swap(left_class, target, a): 2494 target.insert(0, a) 2495 left_class.remove(a) 2496 a2c[a] = id 2497 n_moves += 1 2498 if right in a2c: 2499 right_class = classes[a2c[right]] 2500 if len(right_class) > 1: 2501 a = right_class[0] 2502 if self._swap(right_class, target, a): 2503 target.append(a) 2504 right_class.remove(a) 2505 n_moves += 1 2506 a2c[a] = id 2507 if not n_moves: 2508 growing = False 2509 total_moves += n_moves 2510 if not total_moves: 2511 swapping = False 2512 xs = self.y.copy() 2513 xs.sort() 2514 self.bins = np.array([xs[cl][-1] for cl in classes]) 2515 2516 def _ss(self, class_def): 2517 """calculates sum of squares for a class""" 2518 yc = self.y[class_def] 2519 css = yc - yc.mean() 2520 css *= css 2521 return sum(css) 2522 2523 def _swap(self, class1, class2, a): 2524 """evaluate cost of moving a from class1 to class2""" 2525 ss1 = self._ss(class1) 2526 ss2 = self._ss(class2) 2527 tss1 = ss1 + ss2 2528 class1c = copy.copy(class1) 2529 class2c = copy.copy(class2) 2530 class1c.remove(a) 2531 class2c.append(a) 2532 ss1 = self._ss(class1c) 2533 ss2 = self._ss(class2c) 2534 tss2 = ss1 + ss2 2535 if tss1 < tss2: 2536 return False 2537 else: 2538 return True 2539 2540 def update(self, y=None, inplace=False, **kwargs): 2541 """ 2542 Add data or change classification parameters. 2543 2544 Parameters 2545 ---------- 2546 y : array 2547 (n,1) array of data to classify 2548 inplace : bool 2549 whether to conduct the update in place or to return a 2550 copy estimated from the additional specifications. 2551 2552 Additional parameters provided in **kwargs are passed to the init 2553 function of the class. For documentation, check the class constructor. 2554 """ 2555 kwargs.update({"initial": kwargs.pop("initial", self.initial)}) 2556 if inplace: 2557 self._update(y, bins, **kwargs) 2558 else: 2559 new = copy.deepcopy(self) 2560 new._update(y, bins, **kwargs) 2561 return new 2562 2563 2564def _fit(y, classes): 2565 """Calculate the total sum of squares for a vector y classified into 2566 classes 2567 2568 Parameters 2569 ---------- 2570 y : array 2571 (n,1), variable to be classified 2572 2573 classes : array 2574 (k,1), integer values denoting class membership 2575 2576 """ 2577 tss = 0 2578 for class_def in classes: 2579 yc = y[class_def] 2580 css = yc - yc.mean() 2581 css *= css 2582 tss += sum(css) 2583 return tss 2584 2585 2586kmethods = {} 2587kmethods["Quantiles"] = Quantiles 2588kmethods["FisherJenks"] = FisherJenks 2589kmethods["NaturalBreaks"] = NaturalBreaks 2590kmethods["MaximumBreaks"] = MaximumBreaks 2591 2592 2593def gadf(y, method="Quantiles", maxk=15, pct=0.8): 2594 r""" 2595 Evaluate the Goodness of Absolute Deviation Fit of a Classifier 2596 Finds the minimum value of k for which gadf>pct 2597 2598 Parameters 2599 ---------- 2600 2601 y : array 2602 (n, 1) values to be classified 2603 method : {'Quantiles, 'Fisher_Jenks', 'Maximum_Breaks', 'Natrual_Breaks'} 2604 maxk : int 2605 maximum value of k to evaluate 2606 pct : float 2607 The percentage of GADF to exceed 2608 2609 Returns 2610 ------- 2611 k : int 2612 number of classes 2613 cl : object 2614 instance of the classifier at k 2615 gadf : float 2616 goodness of absolute deviation fit 2617 2618 Examples 2619 -------- 2620 >>> import mapclassify as mc 2621 >>> cal = mc.load_example() 2622 >>> qgadf = mc.classifiers.gadf(cal) 2623 >>> qgadf[0] 2624 15 2625 >>> qgadf[-1] 2626 0.3740257590909283 2627 2628 Quantiles fail to exceed 0.80 before 15 classes. If we lower the bar to 2629 0.2 we see quintiles as a result 2630 2631 >>> qgadf2 = mc.classifiers.gadf(cal, pct = 0.2) 2632 >>> qgadf2[0] 2633 5 2634 >>> qgadf2[-1] 2635 0.21710231966462412 2636 >>> 2637 2638 Notes 2639 ----- 2640 2641 The GADF is defined as: 2642 2643 .. math:: 2644 2645 GADF = 1 - \sum_c \sum_{i \in c} 2646 |y_i - y_{c,med}| / \sum_i |y_i - y_{med}| 2647 2648 where :math:`y_{med}` is the global median and :math:`y_{c,med}` is 2649 the median for class :math:`c`. 2650 2651 See Also 2652 -------- 2653 KClassifiers 2654 """ 2655 2656 y = np.array(y) 2657 adam = (np.abs(y - np.median(y))).sum() 2658 for k in range(2, maxk + 1): 2659 cl = kmethods[method](y, k) 2660 gadf = 1 - cl.adcm / adam 2661 if gadf > pct: 2662 break 2663 return (k, cl, gadf) 2664 2665 2666class KClassifiers(object): 2667 """ 2668 Evaluate all k-classifers and pick optimal based on k and GADF 2669 2670 Parameters 2671 ---------- 2672 y : array 2673 (n,1), values to be classified 2674 pct : float 2675 The percentage of GADF to exceed 2676 2677 Attributes 2678 ---------- 2679 best : object 2680 instance of the optimal MapClassifier 2681 results : dictionary 2682 keys are classifier names, values are the MapClassifier 2683 instances with the best pct for each classifer 2684 2685 Examples 2686 -------- 2687 >>> import mapclassify as mc 2688 >>> cal = mc.load_example() 2689 >>> ks = mc.classifiers.KClassifiers(cal) 2690 >>> ks.best.name 2691 'FisherJenks' 2692 >>> ks.best.k 2693 4 2694 >>> ks.best.gadf 2695 0.8481032719908105 2696 2697 Notes 2698 ----- 2699 This can be used to suggest a classification scheme. 2700 2701 See Also 2702 -------- 2703 gadf 2704 2705 """ 2706 2707 def __init__(self, y, pct=0.8): 2708 results = {} 2709 best = gadf(y, "FisherJenks", maxk=len(y) - 1, pct=pct) 2710 pct0 = best[0] 2711 k0 = best[-1] 2712 keys = list(kmethods.keys()) 2713 keys.remove("FisherJenks") 2714 results["FisherJenks"] = best 2715 for method in keys: 2716 results[method] = gadf(y, method, maxk=len(y) - 1, pct=pct) 2717 k1 = results[method][0] 2718 pct1 = results[method][-1] 2719 if (k1 < k0) or (k1 == k0 and pct0 < pct1): 2720 best = results[method] 2721 k0 = k1 2722 pct0 = pct1 2723 self.results = results 2724 self.best = best[1] 2725