1"""
2A module of classification schemes for choropleth mapping.
3"""
4import numpy as np
5import scipy.stats as stats
6import copy
7from sklearn.cluster import KMeans as KMEANS
8from warnings import warn as Warn
9
10__author__ = "Sergio J. Rey"
11
12__all__ = [
13    "MapClassifier",
14    "quantile",
15    "BoxPlot",
16    "EqualInterval",
17    "FisherJenks",
18    "FisherJenksSampled",
19    "JenksCaspall",
20    "JenksCaspallForced",
21    "JenksCaspallSampled",
22    "HeadTailBreaks",
23    "MaxP",
24    "MaximumBreaks",
25    "NaturalBreaks",
26    "Quantiles",
27    "Percentiles",
28    "StdMean",
29    "UserDefined",
30    "gadf",
31    "KClassifiers",
32    "CLASSIFIERS",
33]
34
35CLASSIFIERS = (
36    "BoxPlot",
37    "EqualInterval",
38    "FisherJenks",
39    "FisherJenksSampled",
40    "HeadTailBreaks",
41    "JenksCaspall",
42    "JenksCaspallForced",
43    "JenksCaspallSampled",
44    "MaxP",
45    "MaximumBreaks",
46    "NaturalBreaks",
47    "Quantiles",
48    "Percentiles",
49    "StdMean",
50    "UserDefined",
51)
52
53K = 5  # default number of classes in any map scheme with this as an argument
54SEEDRANGE = 1000000  # range for drawing random ints from for Natural Breaks
55
56
57FMT = "{:.2f}"
58
59try:
60    from numba import jit
61except ImportError:
62
63    def jit(func):
64        return func
65
66
67def _format_intervals(mc, fmt="{:.0f}"):
68    """
69    Helper methods to format legend intervals
70
71
72    Parameters
73    ----------
74
75    mc: MapClassifier
76
77    fmt: str
78         specification of formatting for legend
79
80    Returns
81    -------
82    tuple:
83         edges: list
84                k strings for class intervals
85         max_width: int
86                length of largest interval string
87         lower_open: boolean
88                True: lower bound of first interval is open
89                False: lower bound of first interval is closed
90
91    Notes
92    -----
93    For some classifiers, it is possible that the upper bound of the first interval is less than the minimum value of the attribute that is being classified. In these cases `lower_open=True` and the lower bound of the interval is set to `np.NINF`.
94    """
95
96
97    lowest = mc.y.min()
98    if hasattr(mc, 'lowest'):
99        if mc.lowest is not None:
100            lowest = mc.lowest
101    lower_open = False
102    if lowest > mc.bins[0]:
103        lowest = np.NINF
104        lower_open = True
105    edges = [lowest]
106    edges.extend(mc.bins)
107    edges = [fmt.format(edge) for edge in edges]
108    max_width = max([len(edge) for edge in edges])
109    return edges, max_width, lower_open
110
111
112def _get_mpl_labels(mc, fmt="{:.1f}"):
113    """
114    Helper method to format legend intervals for matplotlib (and geopandas)
115
116    Parameters
117    ----------
118
119    mc: MapClassifier
120
121    fmt: str
122         specification of formatting for legend
123
124    Returns
125    -------
126    intervals: list
127             k strings for class intervals
128    """
129    edges, max_width, lower_open = _format_intervals(mc, fmt)
130    k = len(edges) - 1
131    left = ["["]
132    if lower_open:
133        left = ["("]
134    left.extend("(" * k)
135    right = "]" * (k + 1)
136    lower = ["{:>{width}}".format(edges[i], width=max_width) for i in range(k)]
137    upper = ["{:>{width}}".format(edges[i], width=max_width) for i in range(1, k + 1)]
138    lower = [l + r for l, r in zip(left, lower)]
139    upper = [l + r for l, r in zip(upper, right)]
140    intervals = [l + ", " + r for l, r in zip(lower, upper)]
141    return intervals
142
143
144def _get_table(mc, fmt="{:.2f}"):
145    """
146    Helper function to generate tabular classification report
147
148    Parameters
149    ----------
150
151    mc: MapClassifier
152
153    fmt: str
154         specification of formatting for legend
155
156    Returns
157    -------
158    table: string
159           formatted table of classification results
160
161    """
162    intervals = _get_mpl_labels(mc, fmt)
163    interval_width = len(intervals[0])
164    counts = list(map(str, mc.counts))
165    count_width = max([len(count) for count in counts])
166    count_width = max(count_width, len("count"))
167    interval_width = max(interval_width, len("interval"))
168    header = "{:^{width}}".format("Interval", width=interval_width)
169    header += "   " + "{:>{width}}".format("Count", width=count_width)
170    title = "{:<{width}}".format(mc.name, width=len(header))
171    header += "\n" + "-" * len(header)
172    table = [title, "", header]
173    for i, interval in enumerate(intervals):
174        row = interval + " | " + "{:>{width}}".format(counts[i], width=count_width)
175        table.append(row)
176    return "\n".join(table)
177
178
179def head_tail_breaks(values, cuts):
180    """
181    head tail breaks helper function
182    """
183    values = np.array(values)
184    mean = np.mean(values)
185    cuts.append(mean)
186    if len(set(values)) > 1:
187        return head_tail_breaks(values[values > mean], cuts)
188    return cuts
189
190
191def quantile(y, k=4):
192    """
193    Calculates the quantiles for an array
194
195    Parameters
196    ----------
197    y : array
198        (n,1), values to classify
199    k : int
200        number of quantiles
201
202    Returns
203    -------
204    q         : array
205                (n,1), quantile values
206
207    Examples
208    --------
209    >>> import numpy as np
210    >>> import mapclassify as mc
211    >>> x = np.arange(1000)
212    >>> mc.classifiers.quantile(x)
213    array([249.75, 499.5 , 749.25, 999.  ])
214    >>> mc.classifiers.quantile(x, k = 3)
215    array([333., 666., 999.])
216
217    Note that if there are enough ties that the quantile values repeat, we
218    collapse to pseudo quantiles in which case the number of classes will be
219    less than k
220
221    >>> x = [1.0] * 100
222    >>> x.extend([3.0] * 40)
223    >>> len(x)
224    140
225    >>> y = np.array(x)
226    >>> mc.classifiers.quantile(y)
227    array([1., 3.])
228    """
229
230    w = 100.0 / k
231    p = np.arange(w, 100 + w, w)
232    if p[-1] > 100.0:
233        p[-1] = 100.0
234    q = np.array([stats.scoreatpercentile(y, pct) for pct in p])
235    q = np.unique(q)
236    k_q = len(q)
237    if k_q < k:
238        Warn(
239            "Warning: Not enough unique values in array to form k classes", UserWarning
240        )
241        Warn("Warning: setting k to %d" % k_q, UserWarning)
242    return q
243
244
245def binC(y, bins):
246    """
247    Bin categorical/qualitative data
248
249    Parameters
250    ----------
251    y    : array
252           (n,q), categorical values
253    bins : array
254           (k,1),  unique values associated with each bin
255
256    Return
257    ------
258    b : array
259        (n,q), bin membership, values between 0 and k-1
260
261    Examples
262    --------
263    >>> import numpy as np
264    >>> import mapclassify as mc
265    >>> np.random.seed(1)
266    >>> x = np.random.randint(2, 8, (10, 3))
267    >>> bins = list(range(2, 8))
268    >>> x
269    array([[7, 5, 6],
270           [2, 3, 5],
271           [7, 2, 2],
272           [3, 6, 7],
273           [6, 3, 4],
274           [6, 7, 4],
275           [6, 5, 6],
276           [4, 6, 7],
277           [4, 6, 3],
278           [3, 2, 7]])
279    >>> y = mc.classifiers.binC(x, bins)
280    >>> y
281    array([[5, 3, 4],
282           [0, 1, 3],
283           [5, 0, 0],
284           [1, 4, 5],
285           [4, 1, 2],
286           [4, 5, 2],
287           [4, 3, 4],
288           [2, 4, 5],
289           [2, 4, 1],
290           [1, 0, 5]])
291    """
292
293    if np.ndim(y) == 1:
294        k = 1
295        n = np.shape(y)[0]
296    else:
297        n, k = np.shape(y)
298    b = np.zeros((n, k), dtype="int")
299    for i, bin in enumerate(bins):
300        b[np.nonzero(y == bin)] = i
301
302    # check for non-binned items and warn if needed
303    vals = set(y.flatten())
304    for val in vals:
305        if val not in bins:
306            Warn("value not in bin: {}".format(val), UserWarning)
307            Warn("bins: {}".format(bins), UserWarning)
308
309    return b
310
311
312def bin(y, bins):
313    """
314    bin interval/ratio data
315
316    Parameters
317    ----------
318    y : array
319        (n,q), values to bin
320    bins : array
321           (k,1), upper bounds of each bin (monotonic)
322
323    Returns
324    -------
325    b : array
326        (n,q), values of values between 0 and k-1
327
328    Examples
329    --------
330    >>> import numpy as np
331    >>> import mapclassify as mc
332    >>> np.random.seed(1)
333    >>> x = np.random.randint(2, 20, (10, 3))
334    >>> bins = [10, 15, 20]
335    >>> b = mc.classifiers.bin(x, bins)
336    >>> x
337    array([[ 7, 13, 14],
338           [10, 11, 13],
339           [ 7, 17,  2],
340           [18,  3, 14],
341           [ 9, 15,  8],
342           [ 7, 13, 12],
343           [16,  6, 11],
344           [19,  2, 15],
345           [11, 11,  9],
346           [ 3,  2, 19]])
347    >>> b
348    array([[0, 1, 1],
349           [0, 1, 1],
350           [0, 2, 0],
351           [2, 0, 1],
352           [0, 1, 0],
353           [0, 1, 1],
354           [2, 0, 1],
355           [2, 0, 1],
356           [1, 1, 0],
357           [0, 0, 2]])
358    """
359    if np.ndim(y) == 1:
360        k = 1
361        n = np.shape(y)[0]
362    else:
363        n, k = np.shape(y)
364    b = np.zeros((n, k), dtype="int")
365    i = len(bins)
366    if type(bins) != list:
367        bins = bins.tolist()
368    binsc = copy.copy(bins)
369    while binsc:
370        i -= 1
371        c = binsc.pop(-1)
372        b[np.nonzero(y <= c)] = i
373    return b
374
375
376def bin1d(x, bins):
377    """
378    Place values of a 1-d array into bins and determine counts of values in
379    each bin
380
381    Parameters
382    ----------
383    x : array
384        (n, 1), values to bin
385    bins : array
386           (k,1), upper bounds of each bin (monotonic)
387
388    Returns
389    -------
390    binIds : array
391             1-d array of integer bin Ids
392
393    counts : int
394            number of elements of x falling in each bin
395
396    Examples
397    --------
398    >>> import numpy as np
399    >>> import mapclassify as mc
400    >>> x = np.arange(100, dtype = 'float')
401    >>> bins = [25, 74, 100]
402    >>> binIds, counts = mc.classifiers.bin1d(x, bins)
403    >>> binIds
404    array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
405           0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
406           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
407           1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
408           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
409    >>> counts
410    array([26, 49, 25])
411    """
412    left = [-float("inf")]
413    left.extend(bins[0:-1])
414    right = bins
415    cuts = list(zip(left, right))
416    k = len(bins)
417    binIds = np.zeros(x.shape, dtype="int")
418    while cuts:
419        k -= 1
420        l, r = cuts.pop(-1)
421        binIds += (x > l) * (x <= r) * k
422    counts = np.bincount(binIds, minlength=len(bins))
423    return (binIds, counts)
424
425
426def load_example():
427    """
428    Helper function for doc tests
429    """
430    from .datasets import calemp
431
432    return calemp.load()
433
434
435def _kmeans(y, k=5, n_init=10):
436    """
437    Helper function to do k-means in one dimension
438
439    Parameters
440    ----------
441
442    y       : array
443              (n,1), values to classify
444    k       : int
445              number of classes to form
446
447    n_init : int, default: 10
448              number of initial  solutions. Best of initial results is returned.
449    """
450
451    y = y * 1.0  # KMEANS needs float or double dtype
452    y.shape = (-1, 1)
453    result = KMEANS(n_clusters=k, init="k-means++", n_init=n_init).fit(y)
454    class_ids = result.labels_
455    centroids = result.cluster_centers_
456    binning = []
457    for c in range(k):
458        values = y[class_ids == c]
459        binning.append([values.max(), len(values)])
460    binning = np.array(binning)
461    binning = binning[binning[:, 0].argsort()]
462    cuts = binning[:, 0]
463
464    y_cent = np.zeros_like(y)
465    for c in range(k):
466        y_cent[class_ids == c] = centroids[c]
467    diffs = y - y_cent
468    diffs *= diffs
469
470    return class_ids, cuts, diffs.sum(), centroids
471
472
473def natural_breaks(values, k=5, init=10):
474    """
475    natural breaks helper function
476
477    Jenks natural breaks is kmeans in one dimension
478
479    Parameters
480    ----------
481
482    values : array
483             (n, 1) values to bin
484
485    k : int
486        Number of classes
487
488    init: int, default:10
489        Number of different solutions to obtain using different centroids. Best solution is returned.
490
491
492    """
493    values = np.array(values)
494    uv = np.unique(values)
495    uvk = len(uv)
496    if uvk < k:
497        Warn(
498            "Warning: Not enough unique values in array to form k classes", UserWarning
499        )
500        Warn("Warning: setting k to %d" % uvk, UserWarning)
501        k = uvk
502    kres = _kmeans(values, k, n_init=init)
503    sids = kres[-1]  # centroids
504    fit = kres[-2]
505    class_ids = kres[0]
506    cuts = kres[1]
507    return (sids, class_ids, fit, cuts)
508
509
510@jit
511def _fisher_jenks_means(values, classes=5, sort=True):
512    """
513    Jenks Optimal (Natural Breaks) algorithm implemented in Python.
514
515    Notes
516    -----
517    The original Python code comes from here:
518    http://danieljlewis.org/2010/06/07/jenks-natural-breaks-algorithm-in-python/
519    and is based on a JAVA and Fortran code available here:
520    https://stat.ethz.ch/pipermail/r-sig-geo/2006-March/000811.html
521
522    Returns class breaks such that classes are internally homogeneous while
523    assuring heterogeneity among classes.
524
525    """
526    if sort:
527        values.sort()
528    n_data = len(values)
529    mat1 = np.zeros((n_data + 1, classes + 1), dtype=np.int32)
530    mat2 = np.zeros((n_data + 1, classes + 1), dtype=np.float32)
531    mat1[1, 1:] = 1
532    mat2[2:, 1:] = np.inf
533
534    v = np.float32(0)
535    for l in range(2, len(values) + 1):
536        s1 = np.float32(0)
537        s2 = np.float32(0)
538        w = np.float32(0)
539        for m in range(1, l + 1):
540            i3 = l - m + 1
541            val = np.float32(values[i3 - 1])
542            s2 += val * val
543            s1 += val
544            w += np.float32(1)
545            v = s2 - (s1 * s1) / w
546            i4 = i3 - 1
547            if i4 != 0:
548                for j in range(2, classes + 1):
549                    if mat2[l, j] >= (v + mat2[i4, j - 1]):
550                        mat1[l, j] = i3
551                        mat2[l, j] = v + mat2[i4, j - 1]
552        mat1[l, 1] = 1
553        mat2[l, 1] = v
554
555    k = len(values)
556
557    kclass = np.zeros(classes + 1, dtype=values.dtype)
558    kclass[classes] = values[len(values) - 1]
559    kclass[0] = values[0]
560    for countNum in range(classes, 1, -1):
561        pivot = mat1[k, countNum]
562        id = int(pivot - 2)
563        kclass[countNum - 1] = values[id]
564        k = int(pivot - 1)
565    return kclass
566
567
568class MapClassifier(object):
569    r"""
570    Abstract class for all map classifications :cite:`Slocum_2009`
571
572    For an array :math:`y` of :math:`n` values, a map classifier places each
573    value :math:`y_i` into one of :math:`k` mutually exclusive and exhaustive
574    classes.  Each classifer defines the classes based on different criteria,
575    but in all cases the following hold for the classifiers in PySAL:
576
577    .. math:: C_j^l < y_i \le C_j^u \  \forall  i \in C_j
578
579    where :math:`C_j` denotes class :math:`j` which has lower bound
580          :math:`C_j^l` and upper bound :math:`C_j^u`.
581
582    Map Classifiers Supported
583
584    * :class:`mapclassify.classifiers.BoxPlot`
585    * :class:`mapclassify.classifiers.EqualInterval`
586    * :class:`mapclassify.classifiers.FisherJenks`
587    * :class:`mapclassify.classifiers.FisherJenksSampled`
588    * :class:`mapclassify.classifiers.HeadTailBreaks`
589    * :class:`mapclassify.classifiers.JenksCaspall`
590    * :class:`mapclassify.classifiers.JenksCaspallForced`
591    * :class:`mapclassify.classifiers.JenksCaspallSampled`
592    * :class:`mapclassify.classifiers.MaxP`
593    * :class:`mapclassify.classifiers.MaximumBreaks`
594    * :class:`mapclassify.classifiers.NaturalBreaks`
595    * :class:`mapclassify.classifiers.Quantiles`
596    * :class:`mapclassify.classifiers.Percentiles`
597    * :class:`mapclassify.classifiers.StdMean`
598    * :class:`mapclassify.classifiers.UserDefined`
599
600    Utilities:
601
602    In addition to the classifiers, there are several utility functions that
603    can be used to evaluate the properties of a specific classifier,
604    or for automatic selection of a classifier and
605    number of classes.
606
607    * :func:`mapclassify.classifiers.gadf`
608    * :class:`mapclassify.classifiers.K_classifiers`
609
610    """
611
612    def __init__(self, y):
613        y = np.asarray(y).flatten()
614        self.name = "Map Classifier"
615        self.fmt = FMT
616        self.y = y
617        self._classify()
618        self._summary()
619
620    def get_fmt(self):
621        return self._fmt
622
623    def set_fmt(self, fmt):
624        self._fmt = fmt
625
626    fmt = property(get_fmt, set_fmt)
627
628    def _summary(self):
629        yb = self.yb
630        self.classes = [np.nonzero(yb == c)[0].tolist() for c in range(self.k)]
631        self.tss = self.get_tss()
632        self.adcm = self.get_adcm()
633        self.gadf = self.get_gadf()
634
635    def _classify(self):
636        self._set_bins()
637        self.yb, self.counts = bin1d(self.y, self.bins)
638
639    def _update(self, data, *args, **kwargs):
640        """
641        The only thing that *should* happen in this function is
642        1. input sanitization for pandas
643        2. classification/reclassification.
644
645        Using their __init__ methods, all classifiers can re-classify given
646        different input parameters or additional data.
647
648        If you've got a cleverer updating equation than the intial estimation
649        equation, remove the call to self.__init__ below and replace it with
650        the updating function.
651        """
652        if data is not None:
653            data = np.asarray(data).flatten()
654            data = np.append(data.flatten(), self.y)
655        else:
656            data = self.y
657        self.__init__(data, *args, **kwargs)
658
659    @classmethod
660    def make(cls, *args, **kwargs):
661        """
662        Configure and create a classifier that will consume data and produce
663        classifications, given the configuration options specified by this
664        function.
665
666        Note that this like a *partial application* of the relevant class
667        constructor. `make` creates a function that returns classifications; it
668        does not actually do the classification.
669
670        If you want to classify data directly, use the appropriate class
671        constructor, like Quantiles, Max_Breaks, etc.
672
673        If you *have* a classifier object, but want to find which bins new data
674        falls into, use find_bin.
675
676        Parameters
677        ----------
678        *args           : required positional arguments
679                          all positional arguments required by the classifier,
680                          excluding the input data.
681        rolling         : bool
682                          a boolean configuring the outputted classifier to use
683                          a rolling classifier rather than a new classifier for
684                          each input. If rolling, this adds the current data to
685                          all of the previous data in the classifier, and
686                          rebalances the bins, like a running median
687                          computation.
688        return_object   : bool
689                          a boolean configuring the outputted classifier to
690                          return the classifier object or not
691        return_bins     : bool
692                          a boolean configuring the outputted classifier to
693                          return the bins/breaks or not
694        return_counts   : bool
695                          a boolean configuring the outputted classifier to
696                          return the histogram of objects falling into each bin
697                          or not
698
699        Returns
700        -------
701        A function that consumes data and returns their bins (and object,
702        bins/breaks, or counts, if requested).
703
704        Note
705        ----
706        This is most useful when you want to run a classifier many times
707        with a given configuration, such as when classifying many columns of an
708        array or dataframe using the same configuration.
709
710        Examples
711        --------
712        >>> import libpysal as ps
713        >>> import mapclassify as mc
714        >>> import geopandas as gpd
715        >>> df = gpd.read_file(ps.examples.get_path('columbus.dbf'))
716        >>> classifier = mc.Quantiles.make(k=9)
717        >>> cl = df[['HOVAL', 'CRIME', 'INC']].apply(classifier)
718        >>> cl["HOVAL"].values[:10]
719        array([8, 7, 2, 4, 1, 3, 8, 5, 7, 8])
720        >>> cl["CRIME"].values[:10]
721        array([0, 1, 3, 4, 6, 2, 0, 5, 3, 4])
722        >>> cl["INC"].values[:10]
723        array([7, 8, 5, 0, 3, 5, 0, 3, 6, 4])
724        >>> import pandas as pd; from numpy import linspace as lsp
725        >>> data = [lsp(3,8,num=10), lsp(10, 0, num=10), lsp(-5, 15, num=10)]
726        >>> data = pd.DataFrame(data).T
727        >>> data
728                  0          1          2
729        0  3.000000  10.000000  -5.000000
730        1  3.555556   8.888889  -2.777778
731        2  4.111111   7.777778  -0.555556
732        3  4.666667   6.666667   1.666667
733        4  5.222222   5.555556   3.888889
734        5  5.777778   4.444444   6.111111
735        6  6.333333   3.333333   8.333333
736        7  6.888889   2.222222  10.555556
737        8  7.444444   1.111111  12.777778
738        9  8.000000   0.000000  15.000000
739        >>> data.apply(mc.Quantiles.make(rolling=True))
740           0  1  2
741        0  0  4  0
742        1  0  4  0
743        2  1  4  0
744        3  1  3  0
745        4  2  2  1
746        5  2  1  2
747        6  3  0  4
748        7  3  0  4
749        8  4  0  4
750        9  4  0  4
751        >>> dbf = ps.io.open(ps.examples.get_path('baltim.dbf'))
752        >>> data = dbf.by_col_array('PRICE', 'LOTSZ', 'SQFT')
753        >>> my_bins = [1, 10, 20, 40, 80]
754        >>> cl = [mc.UserDefined.make(bins=my_bins)(a) for a in data.T]
755        >>> len(cl)
756        3
757        >>> cl[0][:10]
758        array([4, 5, 5, 5, 4, 4, 5, 4, 4, 5])
759        """
760
761        # only flag overrides return flag
762        to_annotate = copy.deepcopy(kwargs)
763        return_object = kwargs.pop("return_object", False)
764        return_bins = kwargs.pop("return_bins", False)
765        return_counts = kwargs.pop("return_counts", False)
766
767        rolling = kwargs.pop("rolling", False)
768        if rolling:
769            #  just initialize a fake classifier
770            data = list(range(10))
771            cls_instance = cls(data, *args, **kwargs)
772            #  and empty it, since we'll be using the update
773            cls_instance.y = np.array([])
774        else:
775            cls_instance = None
776
777        #  wrap init in a closure to make a consumer.
778        #  Qc Na: "Objects/Closures are poor man's Closures/Objects"
779        def classifier(data, cls_instance=cls_instance):
780            if rolling:
781                cls_instance.update(data, inplace=True, **kwargs)
782                yb = cls_instance.find_bin(data)
783            else:
784                cls_instance = cls(data, *args, **kwargs)
785                yb = cls_instance.yb
786            outs = [yb, None, None, None]
787            outs[1] = cls_instance if return_object else None
788            outs[2] = cls_instance.bins if return_bins else None
789            outs[3] = cls_instance.counts if return_counts else None
790            outs = [a for a in outs if a is not None]
791            if len(outs) == 1:
792                return outs[0]
793            else:
794                return outs
795
796        #  for debugging/jic, keep around the kwargs.
797        #  in future, we might want to make this a thin class, so that we can
798        #  set a custom repr. Call the class `Binner` or something, that's a
799        #  pre-configured Classifier that just consumes data, bins it, &
800        #  possibly updates the bins.
801        classifier._options = to_annotate
802        return classifier
803
804    def update(self, y=None, inplace=False, **kwargs):
805        """
806        Add data or change classification parameters.
807
808        Parameters
809        ----------
810        y       :   array
811                    (n,1) array of data to classify
812        inplace :   bool
813                    whether to conduct the update in place or to return a copy
814                    estimated from the additional specifications.
815
816        Additional parameters provided in **kwargs are passed to the init
817        function of the class. For documentation, check the class constructor.
818        """
819        kwargs.update({"k": kwargs.pop("k", self.k)})
820        if inplace:
821            self._update(y, **kwargs)
822        else:
823            new = copy.deepcopy(self)
824            new._update(y, **kwargs)
825            return new
826
827    def __str__(self):
828        return self.table()
829
830    def __repr__(self):
831        return self.table()
832
833    def table(self):
834        fmt = self.fmt
835        return _get_table(self, fmt=fmt)
836
837    def __call__(self, *args, **kwargs):
838        """
839        This will allow the classifier to be called like it's a function.
840
841        Whether or not we want to make this be "find_bin" or "update" is a
842        design decision.
843
844        I like this as find_bin, since a classifier's job should be to classify
845        the data given to it using the rules estimated from the `_classify()`
846        function.
847        """
848        return self.find_bin(*args)
849
850    def get_tss(self):
851        """
852        Total sum of squares around class means
853
854        Returns sum of squares over all class means
855        """
856        tss = 0
857        for class_def in self.classes:
858            if len(class_def) > 0:
859                yc = self.y[class_def]
860                css = yc - yc.mean()
861                css *= css
862                tss += sum(css)
863        return tss
864
865    def _set_bins(self):
866        pass
867
868    def get_adcm(self):
869        """
870        Absolute deviation around class median (ADCM).
871
872        Calculates the absolute deviations of each observation about its class
873        median as a measure of fit for the classification method.
874
875        Returns sum of ADCM over all classes
876        """
877        adcm = 0
878        for class_def in self.classes:
879            if len(class_def) > 0:
880                yc = self.y[class_def]
881                yc_med = np.median(yc)
882                ycd = np.abs(yc - yc_med)
883                adcm += sum(ycd)
884        return adcm
885
886    def get_gadf(self):
887        """
888        Goodness of absolute deviation of fit
889        """
890        adam = (np.abs(self.y - np.median(self.y))).sum()
891        gadf = 1 - self.adcm / adam
892        return gadf
893
894    def _table_string(self, width=12, decimal=3):
895        labels, largest = self.get_legend_classes(table=True)
896        h1 = "Lower"
897        h1 = h1.center(largest)
898        h2 = " "
899        h2 = h2.center(10)
900        h3 = "Upper"
901        h3 = h3.center(largest + 1)
902
903        largest = "%d" % max(self.counts)
904        largest = len(largest) + 15
905        h4 = "Count"
906
907        h4 = h4.rjust(largest)
908        table = []
909        header = h1 + h2 + h3 + h4
910        table.append(header)
911        table.append("=" * len(header))
912
913        for i, label in enumerate(labels):
914            left, right = label.split()
915            if i == 0:
916                left = " " * largest
917                left += "   x[i] <= "
918            else:
919                left += " < x[i] <= "
920            row = left + right
921            cnt = "%d" % self.counts[i]
922            cnt = cnt.rjust(largest)
923            row += cnt
924            table.append(row)
925        name = self.name
926        top = name.center(len(row))
927        table.insert(0, top)
928        table.insert(1, " ")
929        table = "\n".join(table)
930        return table
931
932    def find_bin(self, x):
933        """
934        Sort input or inputs according to the current bin estimate
935
936        Parameters
937        ----------
938        x       :   array or numeric
939                    a value or array of values to fit within the estimated
940                    bins
941
942        Returns
943        -------
944        a bin index or array of bin indices that classify the input into one of
945        the classifiers' bins.
946
947        Note that this differs from similar functionality in
948        numpy.digitize(x, classi.bins, right=True).
949
950        This will always provide the closest bin, so data "outside" the classifier,
951        above and below the max/min breaks, will be classified into the nearest bin.
952
953        numpy.digitize returns k+1 for data greater than the greatest bin, but retains 0
954        for data below the lowest bin.
955        """
956        x = np.asarray(x).flatten()
957        right = np.digitize(x, self.bins, right=True)
958        if right.max() == len(self.bins):
959            right[right == len(self.bins)] = len(self.bins) - 1
960        return right
961
962    def get_legend_classes(self, fmt=FMT):
963        """
964        Format the strings for the classes on the legend
965
966
967        Parameters
968        ==========
969
970        fmt : string
971              formatting specification
972
973        Returns
974        =======
975        classes: list
976               k strings with class interval definitions
977        """
978        return _get_mpl_labels(self, fmt)
979
980    def plot(
981        self,
982        gdf,
983        border_color="lightgrey",
984        border_width=0.10,
985        title=None,
986        legend=False,
987        cmap="YlGnBu",
988        axis_on=True,
989        legend_kwds={"loc": "lower right", "fmt": FMT},
990        file_name=None,
991        dpi=600,
992        ax=None,
993    ):
994        """
995        Plot Mapclassiifer
996        NOTE: Requires matplotlib, and implicitly requires geopandas
997        dataframe as input.
998
999        Parameters
1000        ---------
1001        gdf           : geopandas geodataframe
1002                        Contains the geometry column for the choropleth map
1003        border_color  : string, optional
1004                        matplotlib color string to use for polygon border
1005                        (Default: lightgrey)
1006        border_width  : float, optional
1007                        width of polygon boarder
1008                        (Default: 0.10)
1009        title         : string, optional
1010                        Title of map
1011                        (Default: None)
1012        cmap          : string, optional
1013                        matplotlib color string for color map to fill polygons
1014                        (Default: YlGn)
1015        axis_on       : boolean, optional
1016                        Show coordinate axes (default True)
1017                        (Default: True)
1018        legend_kwds   : dict, optional
1019                        options for ax.legend()
1020                        (Default: {"loc": "lower right", 'fmt':FMT})
1021        file_name     : string, optional
1022                        Name of file to save figure to.
1023                        (Default: None)
1024        dpi           : int, optional
1025                        Dots per inch for saved figure
1026                        (Default: 600)
1027        ax            : matplotlib axis, optional
1028                        axis on which to plot the choropleth.
1029                        (Default: None, so plots on the current figure)
1030        Returns
1031        -------
1032        f,ax        : tuple
1033                      matplotlib figure, axis on which the plot is made.
1034
1035
1036        Examples
1037        --------
1038
1039        >>> import libpysal as lp
1040        >>> import geopandas
1041        >>> import mapclassify
1042        >>> gdf = geopandas.read_file(lp.examples.get_path("columbus.shp"))
1043        >>> q5 = mapclassify.Quantiles(gdf.CRIME)
1044        >>> q5.plot(gdf)  # doctest: +SKIP
1045        """
1046        try:
1047            import matplotlib.pyplot as plt
1048        except ImportError:
1049            raise ImportError(
1050                "Mapclassify.plot depends on matplotlib.pyplot, and this was"
1051                "not able to be imported. \nInstall matplotlib to"
1052                "plot spatial classifier."
1053            )
1054        if ax is None:
1055            f = plt.figure()
1056            ax = plt.gca()
1057        else:
1058            f = plt.gcf()
1059
1060        ax = gdf.assign(_cl=self.y).plot(
1061            column="_cl",
1062            ax=ax,
1063            cmap=cmap,
1064            edgecolor=border_color,
1065            linewidth=border_width,
1066            scheme=self.name,
1067            legend=legend,
1068            legend_kwds=legend_kwds,
1069        )
1070        if not axis_on:
1071            ax.axis("off")
1072        if title:
1073            f.suptitle(title)
1074        if file_name:
1075            plt.savefig(file_name, dpi=dpi)
1076        return f, ax
1077
1078
1079class HeadTailBreaks(MapClassifier):
1080    """
1081    Head/tail Breaks Map Classification for Heavy-tailed Distributions
1082
1083    Parameters
1084    ----------
1085    y       : array
1086              (n,1), values to classify
1087
1088    Attributes
1089    ----------
1090    yb      : array
1091              (n,1), bin ids for observations,
1092    bins    : array
1093              (k,1), the upper bounds of each class
1094    k       : int
1095              the number of classes
1096    counts  : array
1097              (k,1), the number of observations falling in each class
1098
1099    Examples
1100    --------
1101    >>> import numpy as np
1102    >>> import mapclassify as mc
1103    >>> np.random.seed(10)
1104    >>> cal = mc.load_example()
1105    >>> htb = mc.HeadTailBreaks(cal)
1106    >>> htb.k
1107    3
1108    >>> htb.counts
1109    array([50,  7,  1])
1110    >>> htb.bins
1111    array([ 125.92810345,  811.26      , 4111.45      ])
1112    >>> np.random.seed(123456)
1113    >>> x = np.random.lognormal(3, 1, 1000)
1114    >>> htb = mc.HeadTailBreaks(x)
1115    >>> htb.bins
1116    array([ 32.26204423,  72.50205622, 128.07150107, 190.2899093 ,
1117           264.82847377, 457.88157946, 576.76046949])
1118    >>> htb.counts
1119    array([695, 209,  62,  22,  10,   1,   1])
1120
1121    Notes
1122    -----
1123    Head/tail Breaks is a relatively new classification method developed
1124    for data with a heavy-tailed distribution.
1125
1126    Implementation based on contributions by Alessandra Sozzi <alessandra.sozzi@gmail.com>.
1127
1128    For theoretical details see :cite:`Jiang_2013`.
1129
1130    """
1131
1132    def __init__(self, y):
1133        MapClassifier.__init__(self, y)
1134        self.name = "HeadTailBreaks"
1135
1136    def _set_bins(self):
1137
1138        x = self.y.copy()
1139        bins = []
1140        bins = head_tail_breaks(x, bins)
1141        self.bins = np.array(bins)
1142        self.k = len(self.bins)
1143
1144
1145class EqualInterval(MapClassifier):
1146    """
1147    Equal Interval Classification
1148
1149    Parameters
1150    ----------
1151    y : array
1152        (n,1), values to classify
1153    k : int
1154        number of classes required
1155
1156    Attributes
1157    ----------
1158    yb      : array
1159              (n,1), bin ids for observations,
1160              each value is the id of the class the observation belongs to
1161              yb[i] = j  for j>=1  if bins[j-1] < y[i] <= bins[j], yb[i] = 0
1162              otherwise
1163    bins    : array
1164              (k,1), the upper bounds of each class
1165    k       : int
1166              the number of classes
1167    counts  : array
1168              (k,1), the number of observations falling in each class
1169
1170    Examples
1171    --------
1172    >>> import mapclassify as mc
1173    >>> cal = mc.load_example()
1174    >>> ei = mc.EqualInterval(cal, k = 5)
1175    >>> ei.k
1176    5
1177    >>> ei.counts
1178    array([57,  0,  0,  0,  1])
1179    >>> ei.bins
1180    array([ 822.394, 1644.658, 2466.922, 3289.186, 4111.45 ])
1181
1182    Notes
1183    -----
1184    Intervals defined to have equal width:
1185
1186    .. math::
1187
1188        bins_j = min(y)+w*(j+1)
1189
1190    with :math:`w=\\frac{max(y)-min(j)}{k}`
1191    """
1192
1193    def __init__(self, y, k=K):
1194        """
1195        see class docstring
1196
1197        """
1198        if min(y) == max(y):
1199            raise ValueError("Not enough unique values in array to form k classes.")
1200        self.k = k
1201        MapClassifier.__init__(self, y)
1202        self.name = "EqualInterval"
1203
1204    def _set_bins(self):
1205        y = self.y
1206        k = self.k
1207        max_y = max(y)
1208        min_y = min(y)
1209        rg = max_y - min_y
1210        width = rg * 1.0 / k
1211        cuts = np.arange(min_y + width, max_y + width, width)
1212        if len(cuts) > self.k:  # handle overshooting
1213            cuts = cuts[0:k]
1214        cuts[-1] = max_y
1215        bins = cuts.copy()
1216        self.bins = bins
1217
1218
1219class Percentiles(MapClassifier):
1220    """
1221    Percentiles Map Classification
1222
1223    Parameters
1224    ----------
1225    y    : array
1226           attribute to classify
1227    pct  : array
1228           percentiles default=[1,10,50,90,99,100]
1229
1230    Attributes
1231    ----------
1232    yb     : array
1233             bin ids for observations (numpy array n x 1)
1234    bins   : array
1235             the upper bounds of each class (numpy array k x 1)
1236    k      : int
1237             the number of classes
1238    counts : int
1239             the number of observations falling in each class
1240             (numpy array k x 1)
1241
1242    Examples
1243    --------
1244    >>> import mapclassify as mc
1245    >>> cal = mc.load_example()
1246    >>> p = mc.Percentiles(cal)
1247    >>> p.bins
1248    array([1.357000e-01, 5.530000e-01, 9.365000e+00, 2.139140e+02,
1249           2.179948e+03, 4.111450e+03])
1250    >>> p.counts
1251    array([ 1,  5, 23, 23,  5,  1])
1252    >>> p2 = mc.Percentiles(cal, pct = [50, 100])
1253    >>> p2.bins
1254    array([   9.365, 4111.45 ])
1255    >>> p2.counts
1256    array([29, 29])
1257    >>> p2.k
1258    2
1259    """
1260
1261    def __init__(self, y, pct=[1, 10, 50, 90, 99, 100]):
1262        self.pct = pct
1263        MapClassifier.__init__(self, y)
1264        self.name = "Percentiles"
1265
1266    def _set_bins(self):
1267        y = self.y
1268        pct = self.pct
1269        self.bins = np.array([stats.scoreatpercentile(y, p) for p in pct])
1270        self.k = len(self.bins)
1271
1272    def update(self, y=None, inplace=False, **kwargs):
1273        """
1274        Add data or change classification parameters.
1275
1276        Parameters
1277        ----------
1278        y   :   array
1279                    (n,1) array of data to classify
1280        inplace :   bool
1281                    whether to conduct the update in place or to return a copy
1282                    estimated from the additional specifications.
1283
1284        Additional parameters provided in **kwargs are passed to the init
1285        function of the class. For documentation, check the class constructor.
1286        """
1287        kwargs.update({"pct": kwargs.pop("pct", self.pct)})
1288        if inplace:
1289            self._update(y, **kwargs)
1290        else:
1291            new = copy.deepcopy(self)
1292            new._update(y, **kwargs)
1293            return new
1294
1295
1296class BoxPlot(MapClassifier):
1297    """
1298    BoxPlot Map Classification
1299
1300    Parameters
1301    ----------
1302    y     : array
1303            attribute to classify
1304    hinge : float
1305            multiplier for IQR
1306
1307    Attributes
1308    ----------
1309    yb : array
1310        (n,1), bin ids for observations
1311    bins : array
1312          (n,1), the upper bounds of each class  (monotonic)
1313    k : int
1314        the number of classes
1315    counts : array
1316             (k,1), the number of observations falling in each class
1317    low_outlier_ids : array
1318        indices of observations that are low outliers
1319    high_outlier_ids : array
1320        indices of observations that are high outliers
1321
1322    Notes
1323    -----
1324
1325    The bins are set as follows::
1326
1327        bins[0] = q[0]-hinge*IQR
1328        bins[1] = q[0]
1329        bins[2] = q[1]
1330        bins[3] = q[2]
1331        bins[4] = q[2]+hinge*IQR
1332        bins[5] = inf  (see Notes)
1333
1334    where q is an array of the first three quartiles of y and
1335    IQR=q[2]-q[0]
1336
1337    If q[2]+hinge*IQR > max(y) there will only be 5 classes and no high
1338    outliers, otherwise, there will be 6 classes and at least one high
1339    outlier.
1340
1341    Examples
1342    --------
1343    >>> import mapclassify as mc
1344    >>> cal = mc.load_example()
1345    >>> bp = mc.BoxPlot(cal)
1346    >>> bp.bins
1347    array([-5.287625e+01,  2.567500e+00,  9.365000e+00,  3.953000e+01,
1348            9.497375e+01,  4.111450e+03])
1349    >>> bp.counts
1350    array([ 0, 15, 14, 14,  6,  9])
1351    >>> bp.high_outlier_ids
1352    array([ 0,  6, 18, 29, 33, 36, 37, 40, 42])
1353    >>> cal[bp.high_outlier_ids].values
1354    array([ 329.92,  181.27,  370.5 ,  722.85,  192.05,  110.74, 4111.45,
1355            317.11,  264.93])
1356    >>> bx = mc.BoxPlot(np.arange(100))
1357    >>> bx.bins
1358    array([-49.5 ,  24.75,  49.5 ,  74.25, 148.5 ])
1359
1360    """
1361
1362    def __init__(self, y, hinge=1.5):
1363        """
1364        Parameters
1365        ----------
1366        y : array (n,1)
1367            attribute to classify
1368        hinge : float
1369            multiple of inter-quartile range (default=1.5)
1370        """
1371        self.hinge = hinge
1372        MapClassifier.__init__(self, y)
1373        self.name = "BoxPlot"
1374
1375    def _set_bins(self):
1376        y = self.y
1377        pct = [25, 50, 75, 100]
1378        bins = [stats.scoreatpercentile(y, p) for p in pct]
1379        iqr = bins[-2] - bins[0]
1380        self.iqr = iqr
1381        pivot = self.hinge * iqr
1382        left_fence = bins[0] - pivot
1383        right_fence = bins[-2] + pivot
1384        if right_fence < bins[-1]:
1385            bins.insert(-1, right_fence)
1386        else:
1387            bins[-1] = right_fence
1388        bins.insert(0, left_fence)
1389        self.bins = np.array(bins)
1390        self.k = len(bins)
1391
1392    def _classify(self):
1393        MapClassifier._classify(self)
1394        self.low_outlier_ids = np.nonzero(self.yb == 0)[0]
1395        self.high_outlier_ids = np.nonzero(self.yb == 5)[0]
1396
1397    def update(self, y=None, inplace=False, **kwargs):
1398        """
1399        Add data or change classification parameters.
1400
1401        Parameters
1402        ----------
1403        y       :   array
1404                        (n,1) array of data to classify
1405        inplace     :   bool
1406                        whether to conduct the update in place or to return a
1407                        copy estimated from the additional specifications.
1408
1409        Additional parameters provided in **kwargs are passed to the init
1410        function of the class. For documentation, check the class constructor.
1411        """
1412        kwargs.update({"hinge": kwargs.pop("hinge", self.hinge)})
1413        if inplace:
1414            self._update(y, **kwargs)
1415        else:
1416            new = copy.deepcopy(self)
1417            new._update(y, **kwargs)
1418            return new
1419
1420
1421class Quantiles(MapClassifier):
1422    """
1423    Quantile Map Classification
1424
1425    Parameters
1426    ----------
1427    y : array
1428        (n,1), values to classify
1429    k : int
1430        number of classes required
1431
1432    Attributes
1433    ----------
1434    yb      : array
1435              (n,1), bin ids for observations,
1436              each value is the id of the class the observation belongs to
1437              yb[i] = j  for j>=1  if bins[j-1] < y[i] <= bins[j], yb[i] = 0
1438              otherwise
1439    bins    : array
1440              (k,1), the upper bounds of each class
1441    k       : int
1442              the number of classes
1443    counts  : array
1444              (k,1), the number of observations falling in each class
1445
1446
1447    Examples
1448    --------
1449    >>> import mapclassify as mc
1450    >>> cal = mc.load_example()
1451    >>> q = mc.Quantiles(cal, k = 5)
1452    >>> q.bins
1453    array([1.46400e+00, 5.79800e+00, 1.32780e+01, 5.46160e+01, 4.11145e+03])
1454    >>> q.counts
1455    array([12, 11, 12, 11, 12])
1456    """
1457
1458    def __init__(self, y, k=K):
1459        self.k = k
1460        MapClassifier.__init__(self, y)
1461        self.name = "Quantiles"
1462
1463    def _set_bins(self):
1464        y = self.y
1465        k = self.k
1466        self.bins = quantile(y, k=k)
1467
1468
1469class StdMean(MapClassifier):
1470    """
1471    Standard Deviation and Mean Map Classification
1472
1473    Parameters
1474    ----------
1475    y         : array
1476                (n,1), values to classify
1477    multiples : array
1478                the multiples of the standard deviation to add/subtract from
1479                the sample mean to define the bins, default=[-2,-1,1,2]
1480
1481    Attributes
1482    ----------
1483
1484    yb      : array
1485              (n,1), bin ids for observations,
1486    bins    : array
1487              (k,1), the upper bounds of each class
1488    k       : int
1489              the number of classes
1490    counts  : array
1491              (k,1), the number of observations falling in each class
1492
1493    Examples
1494    --------
1495    >>> import mapclassify as mc
1496    >>> cal = mc.load_example()
1497    >>> st = mc.StdMean(cal)
1498    >>> st.k
1499    5
1500    >>> st.bins
1501    array([-967.36235382, -420.71712519,  672.57333208, 1219.21856072,
1502           4111.45      ])
1503    >>> st.counts
1504    array([ 0,  0, 56,  1,  1])
1505    >>>
1506    >>> st3 = mc.StdMean(cal, multiples = [-3, -1.5, 1.5, 3])
1507    >>> st3.bins
1508    array([-1514.00758246,  -694.03973951,   945.8959464 ,  1765.86378936,
1509            4111.45      ])
1510    >>> st3.counts
1511    array([ 0,  0, 57,  0,  1])
1512
1513    """
1514
1515    def __init__(self, y, multiples=[-2, -1, 1, 2]):
1516        self.multiples = multiples
1517        MapClassifier.__init__(self, y)
1518        self.name = "StdMean"
1519
1520    def _set_bins(self):
1521        y = self.y
1522        s = y.std(ddof=1)
1523        m = y.mean()
1524        cuts = [m + s * w for w in self.multiples]
1525        y_max = y.max()
1526        if cuts[-1] < y_max:
1527            cuts.append(y_max)
1528        self.bins = np.array(cuts)
1529        self.k = len(cuts)
1530
1531    def update(self, y=None, inplace=False, **kwargs):
1532        """
1533        Add data or change classification parameters.
1534
1535        Parameters
1536        ----------
1537        y   :   array
1538                    (n,1) array of data to classify
1539        inplace :   bool
1540                    whether to conduct the update in place or to return a copy
1541                    estimated from the additional specifications.
1542
1543        Additional parameters provided in **kwargs are passed to the init
1544        function of the class. For documentation, check the class constructor.
1545        """
1546        kwargs.update({"multiples": kwargs.pop("multiples", self.multiples)})
1547        if inplace:
1548            self._update(y, **kwargs)
1549        else:
1550            new = copy.deepcopy(self)
1551            new._update(y, **kwargs)
1552            return new
1553
1554
1555class MaximumBreaks(MapClassifier):
1556    """
1557    Maximum Breaks Map Classification
1558
1559    Parameters
1560    ----------
1561    y  : array
1562         (n, 1), values to classify
1563
1564    k  : int
1565         number of classes required
1566
1567    mindiff : float
1568              The minimum difference between class breaks
1569
1570    Attributes
1571    ----------
1572    yb : array
1573         (n, 1), bin ids for observations
1574    bins : array
1575           (k, 1), the upper bounds of each class
1576    k    : int
1577           the number of classes
1578    counts : array
1579             (k, 1), the number of observations falling in each class (numpy
1580             array k x 1)
1581
1582    Examples
1583    --------
1584    >>> import mapclassify as mc
1585    >>> cal = mc.load_example()
1586    >>> mb = mc.MaximumBreaks(cal, k = 5)
1587    >>> mb.k
1588    5
1589    >>> mb.bins
1590    array([ 146.005,  228.49 ,  546.675, 2417.15 , 4111.45 ])
1591    >>> mb.counts
1592    array([50,  2,  4,  1,  1])
1593
1594    """
1595
1596    def __init__(self, y, k=5, mindiff=0):
1597        if min(y) == max(y):
1598            raise ValueError("Not enough unique values in array to form k classes.")
1599        self.k = k
1600        self.mindiff = mindiff
1601        MapClassifier.__init__(self, y)
1602        self.name = "MaximumBreaks"
1603
1604    def _set_bins(self):
1605        xs = self.y.copy()
1606        k = self.k
1607        xs.sort()
1608        min_diff = self.mindiff
1609        diffs = xs[1:] - xs[:-1]
1610        idxs = np.argsort(diffs)
1611        k1 = k - 1
1612
1613        ud = np.unique(diffs)
1614        if len(ud) < k1:
1615            print("Insufficient number of unique diffs. Breaks are random.")
1616        mp = []
1617        for c in range(1, k):
1618            idx = idxs[-c]
1619            cp = (xs[idx] + xs[idx + 1]) / 2.0
1620            mp.append(cp)
1621        mp.append(xs[-1])
1622        mp.sort()
1623        self.bins = np.array(mp)
1624
1625    def update(self, y=None, inplace=False, **kwargs):
1626        """
1627        Add data or change classification parameters.
1628
1629        Parameters
1630        ----------
1631        y   :   array
1632                    (n,1) array of data to classify
1633        inplace :   bool
1634                    whether to conduct the update in place or to return a copy
1635                    estimated from the additional specifications.
1636
1637        Additional parameters provided in **kwargs are passed to the init
1638        function of the class. For documentation, check the class constructor.
1639        """
1640        kwargs.update({"k": kwargs.pop("k", self.k)})
1641        kwargs.update({"mindiff": kwargs.pop("mindiff", self.mindiff)})
1642        if inplace:
1643            self._update(y, **kwargs)
1644        else:
1645            new = copy.deepcopy(self)
1646            new._update(y, **kwargs)
1647            return new
1648
1649
1650class NaturalBreaks(MapClassifier):
1651    """
1652    Natural Breaks Map Classification
1653
1654    Parameters
1655    ----------
1656    y       : array
1657              (n,1), values to classify
1658    k       : int
1659              number of classes required
1660
1661    initial : int, default: 10
1662              Number of initial solutions generated with different centroids. Best of initial results is returned.
1663
1664    Attributes
1665    ----------
1666
1667    yb      : array
1668              (n,1), bin ids for observations,
1669    bins    : array
1670              (k,1), the upper bounds of each class
1671    k       : int
1672              the number of classes
1673    counts  : array
1674              (k,1), the number of observations falling in each class
1675
1676    Examples
1677    --------
1678    >>> import numpy as np
1679    >>> import mapclassify as mc
1680    >>> np.random.seed(123456)
1681    >>> cal = mc.load_example()
1682    >>> nb = mc.NaturalBreaks(cal, k=5)
1683    >>> nb.k
1684    5
1685    >>> nb.counts
1686    array([49,  3,  4,  1,  1])
1687    >>> nb.bins
1688    array([  75.29,  192.05,  370.5 ,  722.85, 4111.45])
1689    >>> x = np.array([1] * 50)
1690    >>> x[-1] = 20
1691    >>> nb = mc.NaturalBreaks(x, k = 5)
1692
1693    Warning: Not enough unique values in array to form k classes
1694    Warning: setting k to 2
1695
1696    >>> nb.bins
1697    array([ 1, 20])
1698    >>> nb.counts
1699    array([49,  1])
1700
1701    """
1702
1703    def __init__(self, y, k=K, initial=10):
1704        self.k = k
1705        self.init = initial
1706        MapClassifier.__init__(self, y)
1707        self.name = "NaturalBreaks"
1708
1709    def _set_bins(self):
1710
1711        x = self.y.copy()
1712        k = self.k
1713        values = np.array(x)
1714        uv = np.unique(values)
1715        uvk = len(uv)
1716        if uvk < k:
1717            ms = "Warning: Not enough unique values in array to form k classes"
1718            Warn(ms, UserWarning)
1719            Warn("Warning: setting k to %d" % uvk, UserWarning)
1720            k = uvk
1721            uv.sort()
1722            # we set the bins equal to the sorted unique values and ramp k
1723            # downwards. no need to call kmeans.
1724            self.bins = uv
1725            self.k = k
1726        else:
1727            res0 = natural_breaks(x, k, init=self.init)
1728            fit = res0[2]
1729            self.bins = np.array(res0[-1])
1730            self.k = len(self.bins)
1731
1732    def update(self, y=None, inplace=False, **kwargs):
1733        """
1734        Add data or change classification parameters.
1735
1736        Parameters
1737        ----------
1738        y           :   array
1739                        (n,1) array of data to classify
1740        inplace     :   bool
1741                        whether to conduct the update in place or to return a
1742                        copy estimated from the additional specifications.
1743
1744        Additional parameters provided in **kwargs are passed to the init
1745        function of the class. For documentation, check the class constructor.
1746        """
1747        kwargs.update({"k": kwargs.pop("k", self.k)})
1748        if inplace:
1749            self._update(y, **kwargs)
1750        else:
1751            new = copy.deepcopy(self)
1752            new._update(y, **kwargs)
1753            return new
1754
1755
1756class FisherJenks(MapClassifier):
1757    """
1758    Fisher Jenks optimal classifier - mean based
1759
1760    Parameters
1761    ----------
1762    y : array
1763        (n,1), values to classify
1764    k : int
1765        number of classes required
1766
1767    Attributes
1768    ----------
1769    yb      : array
1770              (n,1), bin ids for observations
1771    bins    : array
1772              (k,1), the upper bounds of each class
1773    k       : int
1774              the number of classes
1775    counts  : array
1776              (k,1), the number of observations falling in each class
1777
1778    Examples
1779    --------
1780    >>> import mapclassify as mc
1781    >>> cal = mc.load_example()
1782    >>> fj = mc.FisherJenks(cal)
1783    >>> fj.adcm
1784    799.24
1785    >>> fj.bins
1786    array([  75.29,  192.05,  370.5 ,  722.85, 4111.45])
1787    >>> fj.counts
1788    array([49,  3,  4,  1,  1])
1789    >>>
1790    """
1791
1792    def __init__(self, y, k=K):
1793
1794        nu = len(np.unique(y))
1795        if nu < k:
1796            raise ValueError("Fewer unique values than specified classes.")
1797        self.k = k
1798        MapClassifier.__init__(self, y)
1799        self.name = "FisherJenks"
1800
1801    def _set_bins(self):
1802        x = self.y.copy()
1803        self.bins = np.array(_fisher_jenks_means(x, classes=self.k)[1:])
1804
1805
1806class FisherJenksSampled(MapClassifier):
1807    """
1808    Fisher Jenks optimal classifier - mean based using random sample
1809
1810    Parameters
1811    ----------
1812    y      : array
1813             (n,1), values to classify
1814    k      : int
1815             number of classes required
1816    pct    : float
1817             The percentage of n that should form the sample
1818             If pct is specified such that n*pct > 1000, then
1819             pct = 1000./n, unless truncate is False
1820    truncate : boolean
1821               truncate pct in cases where pct * n > 1000., (Default True)
1822
1823    Attributes
1824    ----------
1825    yb      : array
1826              (n,1), bin ids for observations
1827    bins    : array
1828              (k,1), the upper bounds of each class
1829    k       : int
1830              the number of classes
1831    counts  : array
1832              (k,1), the number of observations falling in each class
1833
1834    Notes
1835    -----
1836    For theoretical details see :cite:`Rey_2016`.
1837
1838    """
1839
1840    def __init__(self, y, k=K, pct=0.10, truncate=True):
1841        self.k = k
1842        n = y.size
1843
1844        if (pct * n > 1000) and truncate:
1845            pct = 1000.0 / n
1846        ids = np.random.randint(0, n, int(n * pct))
1847        y = np.asarray(y)
1848        yr = y[ids]
1849        yr[-1] = max(y)  # make sure we have the upper bound
1850        yr[0] = min(y)  # make sure we have the min
1851        self.original_y = y
1852        self.pct = pct
1853        self._truncated = truncate
1854        self.yr = yr
1855        self.yr_n = yr.size
1856        MapClassifier.__init__(self, yr)
1857        self.yb, self.counts = bin1d(y, self.bins)
1858        self.name = "FisherJenksSampled"
1859        self.y = y
1860        self._summary()  # have to recalculate summary stats
1861
1862    def _set_bins(self):
1863        fj = FisherJenks(self.y, self.k)
1864        self.bins = fj.bins
1865
1866    def update(self, y=None, inplace=False, **kwargs):
1867        """
1868        Add data or change classification parameters.
1869
1870        Parameters
1871        ----------
1872        y           :   array
1873                        (n,1) array of data to classify
1874        inplace     :   bool
1875                        whether to conduct the update in place or to return a
1876                        copy estimated from the additional specifications.
1877
1878        Additional parameters provided in **kwargs are passed to the init
1879        function of the class. For documentation, check the class constructor.
1880        """
1881        kwargs.update({"k": kwargs.pop("k", self.k)})
1882        kwargs.update({"pct": kwargs.pop("pct", self.pct)})
1883        kwargs.update({"truncate": kwargs.pop("truncate", self._truncated)})
1884        if inplace:
1885            self._update(y, **kwargs)
1886        else:
1887            new = copy.deepcopy(self)
1888            new._update(y, **kwargs)
1889            return new
1890
1891
1892class JenksCaspall(MapClassifier):
1893    """
1894    Jenks Caspall  Map Classification
1895
1896    Parameters
1897    ----------
1898    y : array
1899        (n,1), values to classify
1900    k : int
1901        number of classes required
1902
1903    Attributes
1904    ----------
1905
1906    yb      : array
1907              (n,1), bin ids for observations,
1908    bins    : array
1909              (k,1), the upper bounds of each class
1910    k       : int
1911              the number of classes
1912    counts  : array
1913              (k,1), the number of observations falling in each class
1914
1915
1916    Examples
1917    --------
1918    >>> import mapclassify as mc
1919    >>> cal = mc.load_example()
1920    >>> jc = mc.JenksCaspall(cal, k = 5)
1921    >>> jc.bins
1922    array([1.81000e+00, 7.60000e+00, 2.98200e+01, 1.81270e+02, 4.11145e+03])
1923    >>> jc.counts
1924    array([14, 13, 14, 10,  7])
1925
1926    """
1927
1928    def __init__(self, y, k=K):
1929        self.k = k
1930        MapClassifier.__init__(self, y)
1931        self.name = "JenksCaspall"
1932
1933    def _set_bins(self):
1934        x = self.y.copy()
1935        k = self.k
1936        # start with quantiles
1937        q = quantile(x, k)
1938        solving = True
1939        xb, cnts = bin1d(x, q)
1940        # class means
1941        if x.ndim == 1:
1942            x.shape = (x.size, 1)
1943        n, k = x.shape
1944        xm = [np.median(x[xb == i]) for i in np.unique(xb)]
1945        xb0 = xb.copy()
1946        q = xm
1947        it = 0
1948        rk = list(range(self.k))
1949        while solving:
1950            xb = np.zeros(xb0.shape, int)
1951            d = abs(x - q)
1952            xb = d.argmin(axis=1)
1953            if (xb0 == xb).all():
1954                solving = False
1955            else:
1956                xb0 = xb
1957            it += 1
1958            q = np.array([np.median(x[xb == i]) for i in rk])
1959        cuts = np.array([max(x[xb == i]) for i in np.unique(xb)])
1960        cuts.shape = (len(cuts),)
1961        self.bins = cuts
1962        self.iterations = it
1963
1964
1965class JenksCaspallSampled(MapClassifier):
1966    """
1967    Jenks Caspall Map Classification using a random sample
1968
1969    Parameters
1970    ----------
1971
1972    y       : array
1973              (n,1), values to classify
1974    k       : int
1975              number of classes required
1976    pct     : float
1977              The percentage of n that should form the sample
1978              If pct is specified such that n*pct > 1000, then pct = 1000./n
1979
1980    Attributes
1981    ----------
1982
1983    yb      : array
1984              (n,1), bin ids for observations,
1985    bins    : array
1986              (k,1), the upper bounds of each class
1987    k       : int
1988              the number of classes
1989    counts  : array
1990              (k,1), the number of observations falling in each class
1991
1992
1993    Examples
1994    --------
1995    >>> import mapclassify as mc
1996    >>> cal = mc.load_example()
1997    >>> x = np.random.random(100000)
1998    >>> jc = mc.JenksCaspall(x)
1999    >>> jcs = mc.JenksCaspallSampled(x)
2000    >>> jc.bins
2001    array([0.1988721 , 0.39624334, 0.59441487, 0.79624357, 0.99999251])
2002    >>> jcs.bins
2003    array([0.20998558, 0.42112792, 0.62752937, 0.80543819, 0.99999251])
2004    >>> jc.counts
2005    array([19943, 19510, 19547, 20297, 20703])
2006    >>> jcs.counts
2007    array([21039, 20908, 20425, 17813, 19815])
2008
2009    # not for testing since we get different times on different hardware
2010    # just included for documentation of likely speed gains
2011    #>>> t1 = time.time(); jc = Jenks_Caspall(x); t2 = time.time()
2012    #>>> t1s = time.time(); jcs = Jenks_Caspall_Sampled(x); t2s = time.time()
2013    #>>> t2 - t1; t2s - t1s
2014    #1.8292930126190186
2015    #0.061631917953491211
2016
2017    Notes
2018    -----
2019    This is intended for large n problems. The logic is to apply
2020    Jenks_Caspall to a random subset of the y space and then bin the
2021    complete vector y on the bins obtained from the subset. This would
2022    trade off some "accuracy" for a gain in speed.
2023
2024    """
2025
2026    def __init__(self, y, k=K, pct=0.10):
2027        self.k = k
2028        n = y.size
2029        if pct * n > 1000:
2030            pct = 1000.0 / n
2031        ids = np.random.randint(0, n, int(n * pct))
2032        y = np.asarray(y)
2033        yr = y[ids]
2034        yr[0] = max(y)  # make sure we have the upper bound
2035        self.original_y = y
2036        self.pct = pct
2037        self.yr = yr
2038        self.yr_n = yr.size
2039        MapClassifier.__init__(self, yr)
2040        self.yb, self.counts = bin1d(y, self.bins)
2041        self.name = "JenksCaspallSampled"
2042        self.y = y
2043        self._summary()  # have to recalculate summary stats
2044
2045    def _set_bins(self):
2046        jc = JenksCaspall(self.y, self.k)
2047        self.bins = jc.bins
2048        self.iterations = jc.iterations
2049
2050    def update(self, y=None, inplace=False, **kwargs):
2051        """
2052        Add data or change classification parameters.
2053
2054        Parameters
2055        ----------
2056        y           :   array
2057                        (n,1) array of data to classify
2058        inplace     :   bool
2059                        whether to conduct the update in place or to return a
2060                        copy estimated from the additional specifications.
2061
2062        Additional parameters provided in **kwargs are passed to the init
2063        function of the class. For documentation, check the class constructor.
2064        """
2065        kwargs.update({"k": kwargs.pop("k", self.k)})
2066        kwargs.update({"pct": kwargs.pop("pct", self.pct)})
2067        if inplace:
2068            self._update(y, **kwargs)
2069        else:
2070            new = copy.deepcopy(self)
2071            new._update(y, **kwargs)
2072            return new
2073
2074
2075class JenksCaspallForced(MapClassifier):
2076    """
2077    Jenks Caspall  Map Classification with forced movements
2078
2079    Parameters
2080    ----------
2081    y : array
2082        (n,1), values to classify
2083    k : int
2084        number of classes required
2085
2086    Attributes
2087    ----------
2088    yb      : array
2089              (n,1), bin ids for observations
2090    bins    : array
2091              (k,1), the upper bounds of each class
2092    k       : int
2093              the number of classes
2094    counts  : array
2095              (k,1), the number of observations falling in each class
2096
2097
2098    Examples
2099    --------
2100    >>> import mapclassify as mc
2101    >>> cal = mc.load_example()
2102    >>> jcf = mc.JenksCaspallForced(cal, k = 5)
2103    >>> jcf.k
2104    5
2105    >>> jcf.bins
2106    array([1.34000e+00, 5.90000e+00, 1.67000e+01, 5.06500e+01, 4.11145e+03])
2107    >>> jcf.counts
2108    array([12, 12, 13,  9, 12])
2109    >>> jcf4 = mc.JenksCaspallForced(cal, k = 4)
2110    >>> jcf4.k
2111    4
2112    >>> jcf4.bins
2113    array([2.51000e+00, 8.70000e+00, 3.66800e+01, 4.11145e+03])
2114    >>> jcf4.counts
2115    array([15, 14, 14, 15])
2116    """
2117
2118    def __init__(self, y, k=K):
2119        if min(y) == max(y):
2120            raise ValueError("Not enough unique values in array to form k classes.")
2121        self.k = k
2122        MapClassifier.__init__(self, y)
2123        self.name = "JenksCaspallForced"
2124
2125    def _set_bins(self):
2126        x = self.y.copy()
2127        k = self.k
2128        q = quantile(x, k)
2129        solving = True
2130        xb, cnt = bin1d(x, q)
2131        # class means
2132        if x.ndim == 1:
2133            x.shape = (x.size, 1)
2134        n, tmp = x.shape
2135        xm = [x[xb == i].mean() for i in np.unique(xb)]
2136        q = xm
2137        xbar = np.array([xm[xbi] for xbi in xb])
2138        xbar.shape = (n, 1)
2139        ss = x - xbar
2140        ss *= ss
2141        ss = sum(ss)
2142        down_moves = up_moves = 0
2143        solving = True
2144        it = 0
2145        while solving:
2146            # try upward moves first
2147            moving_up = True
2148            while moving_up:
2149                class_ids = np.unique(xb)
2150                nk = [sum(xb == j) for j in class_ids]
2151                candidates = nk[:-1]
2152                i = 0
2153                up_moves = 0
2154                while candidates:
2155                    nki = candidates.pop(0)
2156                    if nki > 1:
2157                        ids = np.nonzero(xb == class_ids[i])
2158                        mover = max(ids[0])
2159                        tmp = xb.copy()
2160                        tmp[mover] = xb[mover] + 1
2161                        tm = [x[tmp == j].mean() for j in np.unique(tmp)]
2162                        txbar = np.array([tm[xbi] for xbi in tmp])
2163                        txbar.shape = (n, 1)
2164                        tss = x - txbar
2165                        tss *= tss
2166                        tss = sum(tss)
2167                        if tss < ss:
2168                            xb = tmp
2169                            ss = tss
2170                            candidates = []
2171                            up_moves += 1
2172                    i += 1
2173                if not up_moves:
2174                    moving_up = False
2175            moving_down = True
2176            while moving_down:
2177                class_ids = np.unique(xb)
2178                nk = [sum(xb == j) for j in class_ids]
2179                candidates = nk[1:]
2180                i = 1
2181                down_moves = 0
2182                while candidates:
2183                    nki = candidates.pop(0)
2184                    if nki > 1:
2185                        ids = np.nonzero(xb == class_ids[i])
2186                        mover = min(ids[0])
2187                        mover_class = xb[mover]
2188                        target_class = mover_class - 1
2189                        tmp = xb.copy()
2190                        tmp[mover] = target_class
2191                        tm = [x[tmp == j].mean() for j in np.unique(tmp)]
2192                        txbar = np.array([tm[xbi] for xbi in tmp])
2193                        txbar.shape = (n, 1)
2194                        tss = x - txbar
2195                        tss *= tss
2196                        tss = sum(tss)
2197                        if tss < ss:
2198                            xb = tmp
2199                            ss = tss
2200                            candidates = []
2201                            down_moves += 1
2202                    i += 1
2203                if not down_moves:
2204                    moving_down = False
2205            if not up_moves and not down_moves:
2206                solving = False
2207            it += 1
2208        cuts = [max(x[xb == c]) for c in np.unique(xb)]
2209        cuts = np.reshape(np.array(cuts), (k,))
2210        self.bins = cuts
2211        self.iterations = it
2212
2213
2214class UserDefined(MapClassifier):
2215    """
2216    User Specified Binning
2217
2218    Parameters
2219    ----------
2220    y    : array
2221           (n,1), values to classify
2222    bins : array
2223           (k,1), upper bounds of classes (have to be monotically increasing)
2224
2225    lowest  : float (optional)
2226           scalar minimum value of lowest class. Default is to set the minimum
2227           to -inf if  y.min() > first upper bound, otherwise minimum is set to
2228           y.min(). lowest will override the default
2229
2230
2231
2232    Attributes
2233    ----------
2234    yb      : array
2235              (n,1), bin ids for observations,
2236    bins    : array
2237              (k,1), the upper bounds of each class
2238    k       : int
2239              the number of classes
2240    counts  : array
2241              (k,1), the number of observations falling in each class
2242
2243
2244    Examples
2245    --------
2246    >>> import mapclassify as mc
2247    >>> cal = mc.load_example()
2248    >>> bins = [20, max(cal)]
2249    >>> bins
2250    [20, 4111.45]
2251    >>> ud = mc.UserDefined(cal, bins)
2252    >>> ud.bins
2253    array([  20.  , 4111.45])
2254    >>> ud.counts
2255    array([37, 21])
2256    >>> bins = [20, 30]
2257    >>> ud = mc.UserDefined(cal, bins)
2258    >>> ud.bins
2259    array([  20.  ,   30.  , 4111.45])
2260    >>> ud.counts
2261    array([37,  4, 17])
2262
2263    Notes
2264    -----
2265    If upper bound of user bins does not exceed max(y) we append an
2266    additional bin.
2267
2268    """
2269
2270    def __init__(self, y, bins, lowest=None):
2271        if bins[-1] < max(y):
2272            bins = np.append(bins, max(y))
2273        self.lowest = lowest
2274        self.k = len(bins)
2275        self.bins = np.array(bins)
2276        self.y = y
2277        MapClassifier.__init__(self, y)
2278        self.name = "UserDefined"
2279
2280    def _set_bins(self):
2281        pass
2282
2283    def _update(self, y=None, bins=None):
2284        if y is not None:
2285            if hasattr(y, "values"):
2286                y = y.values
2287            y = np.append(y.flatten(), self.y)
2288        else:
2289            y = self.y
2290        if bins is None:
2291            bins = self.bins
2292        self.__init__(y, bins)
2293
2294    def update(self, y=None, inplace=False, **kwargs):
2295        """
2296        Add data or change classification parameters.
2297
2298        Parameters
2299        ----------
2300        y           :   array
2301                        (n,1) array of data to classify
2302        inplace     :   bool
2303                        whether to conduct the update in place or to return a
2304                        copy estimated from the additional specifications.
2305
2306        Additional parameters provided in **kwargs are passed to the init
2307        function of the class. For documentation, check the class constructor.
2308        """
2309        bins = kwargs.pop("bins", self.bins)
2310        if inplace:
2311            self._update(y=y, bins=bins, **kwargs)
2312        else:
2313            new = copy.deepcopy(self)
2314            new._update(y, bins, **kwargs)
2315            return new
2316
2317    # We have to override the plot method for additional kwargs for UserDefined
2318    def plot(
2319        self,
2320        gdf,
2321        border_color="lightgrey",
2322        border_width=0.10,
2323        title=None,
2324        legend=False,
2325        cmap="YlGnBu",
2326        axis_on=True,
2327        legend_kwds={"loc": "lower right", "fmt": FMT},
2328        file_name=None,
2329        dpi=600,
2330        ax=None,
2331    ):
2332        try:
2333            import matplotlib.pyplot as plt
2334        except ImportError:
2335            raise ImportError(
2336                "Mapclassify.plot depends on matplotlib.pyplot, and this was"
2337                "not able to be imported. \nInstall matplotlib to"
2338                "plot spatial classifier."
2339            )
2340        if ax is None:
2341            f = plt.figure()
2342            ax = plt.gca()
2343        else:
2344            f = plt.gcf()
2345
2346        fmt = FMT
2347        if "fmt" in legend_kwds:
2348            fmt = legend_kwds.pop("fmt")
2349
2350        ax = gdf.assign(_cl=self.y).plot(
2351            column="_cl",
2352            ax=ax,
2353            cmap=cmap,
2354            edgecolor=border_color,
2355            linewidth=border_width,
2356            scheme=self.name,
2357            legend=legend,
2358            legend_kwds=legend_kwds,
2359            classification_kwds={"bins": self.bins},  # for UserDefined
2360        )
2361        if not axis_on:
2362            ax.axis("off")
2363        if title:
2364            f.suptitle(title)
2365        if file_name:
2366            plt.savefig(file_name, dpi=dpi)
2367        return f, ax
2368
2369
2370class MaxP(MapClassifier):
2371    """
2372    MaxP Map Classification
2373
2374    Based on Max-p regionalization algorithm
2375
2376    Parameters
2377    ----------
2378    y       : array
2379              (n,1), values to classify
2380    k       : int
2381              number of classes required
2382    initial : int
2383              number of initial solutions to use prior to swapping
2384
2385    Attributes
2386    ----------
2387
2388    yb      : array
2389              (n,1), bin ids for observations,
2390    bins    : array
2391              (k,1), the upper bounds of each class
2392    k       : int
2393              the number of classes
2394    counts  : array
2395              (k,1), the number of observations falling in each class
2396
2397    Examples
2398    --------
2399    >>> import mapclassify as mc
2400    >>> cal = mc.load_example()
2401    >>> mp = mc.MaxP(cal)
2402    >>> mp.bins
2403    array([   8.7 ,   16.7 ,   20.47,   66.26, 4111.45])
2404
2405    >>> mp.counts
2406    array([29,  8,  1, 10, 10])
2407    """
2408
2409    def __init__(self, y, k=K, initial=1000):
2410        if min(y) == max(y):
2411            raise ValueError("Not enough unique values in array to form k classes.")
2412        self.k = k
2413        self.initial = initial
2414        MapClassifier.__init__(self, y)
2415        self.name = "MaxP"
2416
2417    def _set_bins(self):
2418        x = self.y.copy()
2419        k = self.k
2420        q = quantile(x, k)
2421        if x.ndim == 1:
2422            x.shape = (x.size, 1)
2423        n, tmp = x.shape
2424        x.sort(axis=0)
2425        # find best of initial solutions
2426        solution = 0
2427        best_tss = x.var() * x.shape[0]
2428        tss_all = np.zeros((self.initial, 1))
2429        while solution < self.initial:
2430            remaining = list(range(n))
2431            seeds = [
2432                np.nonzero(di == min(di))[0][0] for di in [np.abs(x - qi) for qi in q]
2433            ]
2434            rseeds = np.random.permutation(list(range(k))).tolist()
2435            [remaining.remove(seed) for seed in seeds]
2436            self.classes = classes = []
2437            [classes.append([seed]) for seed in seeds]
2438            while rseeds:
2439                seed_id = rseeds.pop()
2440                current = classes[seed_id]
2441                growing = True
2442                while growing:
2443                    current = classes[seed_id]
2444                    low = current[0]
2445                    high = current[-1]
2446                    left = low - 1
2447                    right = high + 1
2448                    move_made = False
2449                    if left in remaining:
2450                        current.insert(0, left)
2451                        remaining.remove(left)
2452                        move_made = True
2453                    if right in remaining:
2454                        current.append(right)
2455                        remaining.remove(right)
2456                        move_made = True
2457                    if move_made:
2458                        classes[seed_id] = current
2459                    else:
2460                        growing = False
2461            tss = _fit(self.y, classes)
2462            tss_all[solution] = tss
2463            if tss < best_tss:
2464                best_solution = classes
2465                best_it = solution
2466                best_tss = tss
2467            solution += 1
2468        classes = best_solution
2469        self.best_it = best_it
2470        self.tss = best_tss
2471        self.a2c = a2c = {}
2472        self.tss_all = tss_all
2473        for r, cl in enumerate(classes):
2474            for a in cl:
2475                a2c[a] = r
2476        swapping = True
2477        while swapping:
2478            rseeds = np.random.permutation(list(range(k))).tolist()
2479            total_moves = 0
2480            while rseeds:
2481                id = rseeds.pop()
2482                growing = True
2483                total_moves = 0
2484                while growing:
2485                    target = classes[id]
2486                    left = target[0] - 1
2487                    right = target[-1] + 1
2488                    n_moves = 0
2489                    if left in a2c:
2490                        left_class = classes[a2c[left]]
2491                        if len(left_class) > 1:
2492                            a = left_class[-1]
2493                            if self._swap(left_class, target, a):
2494                                target.insert(0, a)
2495                                left_class.remove(a)
2496                                a2c[a] = id
2497                                n_moves += 1
2498                    if right in a2c:
2499                        right_class = classes[a2c[right]]
2500                        if len(right_class) > 1:
2501                            a = right_class[0]
2502                            if self._swap(right_class, target, a):
2503                                target.append(a)
2504                                right_class.remove(a)
2505                                n_moves += 1
2506                                a2c[a] = id
2507                    if not n_moves:
2508                        growing = False
2509                total_moves += n_moves
2510            if not total_moves:
2511                swapping = False
2512        xs = self.y.copy()
2513        xs.sort()
2514        self.bins = np.array([xs[cl][-1] for cl in classes])
2515
2516    def _ss(self, class_def):
2517        """calculates sum of squares for a class"""
2518        yc = self.y[class_def]
2519        css = yc - yc.mean()
2520        css *= css
2521        return sum(css)
2522
2523    def _swap(self, class1, class2, a):
2524        """evaluate cost of moving a from class1 to class2"""
2525        ss1 = self._ss(class1)
2526        ss2 = self._ss(class2)
2527        tss1 = ss1 + ss2
2528        class1c = copy.copy(class1)
2529        class2c = copy.copy(class2)
2530        class1c.remove(a)
2531        class2c.append(a)
2532        ss1 = self._ss(class1c)
2533        ss2 = self._ss(class2c)
2534        tss2 = ss1 + ss2
2535        if tss1 < tss2:
2536            return False
2537        else:
2538            return True
2539
2540    def update(self, y=None, inplace=False, **kwargs):
2541        """
2542        Add data or change classification parameters.
2543
2544        Parameters
2545        ----------
2546        y           :   array
2547                        (n,1) array of data to classify
2548        inplace     :   bool
2549                        whether to conduct the update in place or to return a
2550                        copy estimated from the additional specifications.
2551
2552        Additional parameters provided in **kwargs are passed to the init
2553        function of the class. For documentation, check the class constructor.
2554        """
2555        kwargs.update({"initial": kwargs.pop("initial", self.initial)})
2556        if inplace:
2557            self._update(y, bins, **kwargs)
2558        else:
2559            new = copy.deepcopy(self)
2560            new._update(y, bins, **kwargs)
2561            return new
2562
2563
2564def _fit(y, classes):
2565    """Calculate the total sum of squares for a vector y classified into
2566    classes
2567
2568    Parameters
2569    ----------
2570    y : array
2571        (n,1), variable to be classified
2572
2573    classes : array
2574              (k,1), integer values denoting class membership
2575
2576    """
2577    tss = 0
2578    for class_def in classes:
2579        yc = y[class_def]
2580        css = yc - yc.mean()
2581        css *= css
2582        tss += sum(css)
2583    return tss
2584
2585
2586kmethods = {}
2587kmethods["Quantiles"] = Quantiles
2588kmethods["FisherJenks"] = FisherJenks
2589kmethods["NaturalBreaks"] = NaturalBreaks
2590kmethods["MaximumBreaks"] = MaximumBreaks
2591
2592
2593def gadf(y, method="Quantiles", maxk=15, pct=0.8):
2594    r"""
2595    Evaluate the Goodness of Absolute Deviation Fit of a Classifier
2596    Finds the minimum value of k for which gadf>pct
2597
2598    Parameters
2599    ----------
2600
2601    y      : array
2602             (n, 1) values to be classified
2603    method : {'Quantiles, 'Fisher_Jenks', 'Maximum_Breaks', 'Natrual_Breaks'}
2604    maxk   : int
2605             maximum value of k to evaluate
2606    pct    : float
2607             The percentage of GADF to exceed
2608
2609    Returns
2610    -------
2611    k : int
2612        number of classes
2613    cl : object
2614         instance of the classifier at k
2615    gadf : float
2616           goodness of absolute deviation fit
2617
2618    Examples
2619    --------
2620    >>> import mapclassify as mc
2621    >>> cal = mc.load_example()
2622    >>> qgadf = mc.classifiers.gadf(cal)
2623    >>> qgadf[0]
2624    15
2625    >>> qgadf[-1]
2626    0.3740257590909283
2627
2628    Quantiles fail to exceed 0.80 before 15 classes. If we lower the bar to
2629    0.2 we see quintiles as a result
2630
2631    >>> qgadf2 = mc.classifiers.gadf(cal, pct = 0.2)
2632    >>> qgadf2[0]
2633    5
2634    >>> qgadf2[-1]
2635    0.21710231966462412
2636    >>>
2637
2638    Notes
2639    -----
2640
2641    The GADF is defined as:
2642
2643        .. math::
2644
2645            GADF = 1 - \sum_c \sum_{i \in c}
2646                   |y_i - y_{c,med}|  / \sum_i |y_i - y_{med}|
2647
2648        where :math:`y_{med}` is the global median and :math:`y_{c,med}` is
2649        the median for class :math:`c`.
2650
2651    See Also
2652    --------
2653    KClassifiers
2654    """
2655
2656    y = np.array(y)
2657    adam = (np.abs(y - np.median(y))).sum()
2658    for k in range(2, maxk + 1):
2659        cl = kmethods[method](y, k)
2660        gadf = 1 - cl.adcm / adam
2661        if gadf > pct:
2662            break
2663    return (k, cl, gadf)
2664
2665
2666class KClassifiers(object):
2667    """
2668    Evaluate all k-classifers and pick optimal based on k and GADF
2669
2670    Parameters
2671    ----------
2672    y      : array
2673             (n,1), values to be classified
2674    pct    : float
2675             The percentage of GADF to exceed
2676
2677    Attributes
2678    ----------
2679    best   :  object
2680              instance of the optimal MapClassifier
2681    results : dictionary
2682              keys are classifier names, values are the MapClassifier
2683              instances with the best pct for each classifer
2684
2685    Examples
2686    --------
2687    >>> import mapclassify as mc
2688    >>> cal = mc.load_example()
2689    >>> ks = mc.classifiers.KClassifiers(cal)
2690    >>> ks.best.name
2691    'FisherJenks'
2692    >>> ks.best.k
2693    4
2694    >>> ks.best.gadf
2695    0.8481032719908105
2696
2697    Notes
2698    -----
2699    This can be used to suggest a classification scheme.
2700
2701    See Also
2702    --------
2703    gadf
2704
2705    """
2706
2707    def __init__(self, y, pct=0.8):
2708        results = {}
2709        best = gadf(y, "FisherJenks", maxk=len(y) - 1, pct=pct)
2710        pct0 = best[0]
2711        k0 = best[-1]
2712        keys = list(kmethods.keys())
2713        keys.remove("FisherJenks")
2714        results["FisherJenks"] = best
2715        for method in keys:
2716            results[method] = gadf(y, method, maxk=len(y) - 1, pct=pct)
2717            k1 = results[method][0]
2718            pct1 = results[method][-1]
2719            if (k1 < k0) or (k1 == k0 and pct0 < pct1):
2720                best = results[method]
2721                k0 = k1
2722                pct0 = pct1
2723        self.results = results
2724        self.best = best[1]
2725