# pylint: disable=invalid-name,too-many-lines
"""Density estimation functions for ArviZ."""
import warnings

import numpy as np
from scipy.fftpack import fft
from scipy.optimize import brentq
from scipy.signal import convolve, convolve2d  # pylint: disable=no-name-in-module
from scipy.signal.windows import gaussian
from scipy.sparse import coo_matrix
from scipy.special import ive  # pylint: disable=no-name-in-module

from ..utils import _cov, _dot, _stack, conditional_jit

__all__ = ["kde"]


def _bw_scott(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
    """Scott's Rule."""
    if x_std is None:
        x_std = np.std(x)
    bw = 1.06 * x_std * len(x) ** (-0.2)
    return bw


def _bw_silverman(x, x_std=None, **kwargs):  # pylint: disable=unused-argument
    """Silverman's Rule."""
    if x_std is None:
        x_std = np.std(x)
    q75, q25 = np.percentile(x, [75, 25])
    x_iqr = q75 - q25
    a = min(x_std, x_iqr / 1.34)
    bw = 0.9 * a * len(x) ** (-0.2)
    return bw


def _bw_isj(x, grid_counts=None, x_std=None, x_range=None):
    """Improved Sheather-Jones bandwidth estimation.

    Improved Sheather and Jones method as explained in [1]_. This method is used internally by the
    KDE estimator, saving computation time because the minimum, maximum and the grid are
    pre-computed.

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
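
    Examples
    --------
    A minimal, illustrative sketch using a simulated standard normal sample; the returned
    value is the estimated bandwidth on the scale of the data.

    >>> import numpy as np
    >>> x = np.random.normal(size=1000)
    >>> bw = _bw_isj(x)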
    """
    x_len = len(x)
    if x_range is None:
        x_min = np.min(x)
        x_max = np.max(x)
        x_range = x_max - x_min

    # Relative frequency per bin
    if grid_counts is None:
        x_std = np.std(x)
        grid_len = 256
        grid_min = x_min - 0.5 * x_std
        grid_max = x_max + 0.5 * x_std
        grid_counts, _, _ = histogram(x, grid_len, (grid_min, grid_max))
    else:
        grid_len = len(grid_counts) - 1

    grid_relfreq = grid_counts / x_len

    # Discrete cosine transform of the data
    a_k = _dct1d(grid_relfreq)

    k_sq = np.arange(1, grid_len) ** 2
    a_sq = a_k[range(1, grid_len)] ** 2

    t = _root(_fixed_point, x_len, args=(x_len, k_sq, a_sq), x=x)
    h = t ** 0.5 * x_range
    return h


def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None):
    """Experimental bandwidth estimator."""
    bw_silverman = _bw_silverman(x, x_std=x_std)
    bw_isj = _bw_isj(x, grid_counts=grid_counts, x_range=x_range)
    return 0.5 * (bw_silverman + bw_isj)


def _bw_taylor(x):
    """Taylor's rule for circular bandwidth estimation.

    This function implements a rule-of-thumb for choosing the bandwidth of a von Mises kernel
    density estimator that assumes the underlying distribution is von Mises as introduced in [1]_.
    It is analogous to Scott's rule for the Gaussian KDE.

    Circular bandwidth has a different scale from linear bandwidth. Unlike linear scale, low
    bandwidths are associated with oversmoothing and high values with undersmoothing.

    References
    ----------
    .. [1] C. C. Taylor (2008). Automatic bandwidth selection for circular
           density estimation.
           Computational Statistics and Data Analysis, 52, 7, 3493–3500.
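
    Examples
    --------
    A minimal, illustrative sketch with simulated von Mises data measured in radians; the
    returned bandwidth is a concentration-like value for the von Mises kernel.

    >>> import numpy as np
    >>> x = np.random.vonmises(mu=0.0, kappa=5.0, size=500)
    >>> bw = _bw_taylor(x)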
    """
    x_len = len(x)
    kappa = _kappa_mle(x)
    num = 3 * x_len * kappa ** 2 * ive(2, 2 * kappa)
    den = 4 * np.pi ** 0.5 * ive(0, kappa) ** 2
    return (num / den) ** 0.4


_BW_METHODS_LINEAR = {
    "scott": _bw_scott,
    "silverman": _bw_silverman,
    "isj": _bw_isj,
    "experimental": _bw_experimental,
}


def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None):
    """Compute bandwidth for a given data `x` and `bw`.

    Also checks that `bw` is correctly specified.

    Parameters
    ----------
    x : 1-D numpy array
        1 dimensional array of sample data from the
        variable for which a density estimate is desired.
    bw: int, float or str
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth.

    Returns
    -------
    bw: float
        Bandwidth
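
    Examples
    --------
    A minimal, illustrative sketch with a simulated sample, showing both a named method
    and a fixed numeric bandwidth.

    >>> import numpy as np
    >>> x = np.random.normal(size=500)
    >>> bw_scott = _get_bw(x, "scott")
    >>> bw_fixed = _get_bw(x, 0.75)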
    """
    if isinstance(bw, bool):
        raise ValueError(
            (
                "`bw` must not be of type `bool`.\n"
                "Expected a positive numeric or one of the following strings:\n"
                f"{list(_BW_METHODS_LINEAR.keys())}."
            )
        )
    if isinstance(bw, (int, float)):
        if bw < 0:
            raise ValueError(f"Numeric `bw` must be positive.\nInput: {bw:.4f}.")
    elif isinstance(bw, str):
        bw_lower = bw.lower()

        if bw_lower not in _BW_METHODS_LINEAR.keys():
            raise ValueError(
                "Unrecognized bandwidth method.\n"
                f"Input is: {bw_lower}.\n"
                f"Expected one of: {list(_BW_METHODS_LINEAR.keys())}."
            )

        bw_fun = _BW_METHODS_LINEAR[bw_lower]
        bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range)
    else:
        raise ValueError(
            "Unrecognized `bw` argument.\n"
            "Expected a positive numeric or one of the following strings:\n"
            f"{list(_BW_METHODS_LINEAR.keys())}."
        )
    return bw


def _vonmises_pdf(x, mu, kappa):
    """Calculate the von Mises probability density function."""
    if kappa <= 0:
        raise ValueError("Argument 'kappa' must be positive.")
    pdf = 1 / (2 * np.pi * ive(0, kappa)) * np.exp(np.cos(x - mu) - 1) ** kappa
    return pdf


def _a1inv(x):
    """Compute inverse function.

    Inverse function of the ratio of the first and
    zeroth order Bessel functions of the first kind.

    Returns the value k, such that a1inv(x) = k, i.e. a1(k) = x.
    """
    if 0 <= x < 0.53:
        return 2 * x + x ** 3 + (5 * x ** 5) / 6
    elif x < 0.85:
        return -0.4 + 1.39 * x + 0.43 / (1 - x)
    else:
        return 1 / (x ** 3 - 4 * x ** 2 + 3 * x)


def _kappa_mle(x):
    mean = _circular_mean(x)
    kappa = _a1inv(np.mean(np.cos(x - mean)))
    return kappa


def _dct1d(x):
    """Discrete Cosine Transform in 1 Dimension.

    Parameters
    ----------
    x : numpy array
        1 dimensional array of values for which the
        DCT is desired

    Returns
    -------
    output : DCT transformed values
    """
    x_len = len(x)

    even_increasing = np.arange(0, x_len, 2)
    odd_decreasing = np.arange(x_len - 1, 0, -2)

    x = np.concatenate((x[even_increasing], x[odd_decreasing]))

    w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))]
    output = np.real(w_1k * fft(x))

    return output


def _fixed_point(t, N, k_sq, a_sq):
    """Calculate t-zeta*gamma^[l](t).

    Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1].

    References
    ----------
    .. [1] Kernel density estimation via diffusion.
       Z. I. Botev, J. F. Grotowski, and D. P. Kroese.
       Ann. Statist. 38 (2010), no. 5, 2916--2957.
    """
    k_sq = np.asarray(k_sq, dtype=np.float64)
    a_sq = np.asarray(a_sq, dtype=np.float64)

    l = 7
    f = np.sum(np.power(k_sq, l) * a_sq * np.exp(-k_sq * np.pi ** 2 * t))
    f *= 0.5 * np.pi ** (2.0 * l)

    for j in np.arange(l - 1, 2 - 1, -1):
        c1 = (1 + 0.5 ** (j + 0.5)) / 3
        c2 = np.prod(np.arange(1.0, 2 * j + 1, 2, dtype=np.float64))
        c2 /= (np.pi / 2) ** 0.5
        t_j = np.power((c1 * (c2 / (N * f))), (2.0 / (3.0 + 2.0 * j)))
        f = np.sum(k_sq ** j * a_sq * np.exp(-k_sq * np.pi ** 2.0 * t_j))
        f *= 0.5 * np.pi ** (2 * j)

    out = t - (2 * N * np.pi ** 0.5 * f) ** (-0.4)
    return out


def _root(function, N, args, x):
    # The right bound is at most 0.01
    found = False
    N = max(min(1050, N), 50)
    tol = 10e-12 + 0.01 * (N - 50) / 1000

    while not found:
        try:
            bw, res = brentq(function, 0, 0.01, args=args, full_output=True, disp=False)
            found = res.converged
        except ValueError:
            bw = 0
            tol *= 2.0
            found = False
        if bw <= 0 or tol >= 1:
            bw = (_bw_silverman(x) / np.ptp(x)) ** 2
            return bw
    return bw


def _check_custom_lims(custom_lims, x_min, x_max):
    """Check if `custom_lims` are of the correct type.

    It accepts numeric lists/tuples of length 2, where an element can also be None to
    fall back to the corresponding data bound.

    Parameters
    ----------
    custom_lims : list or tuple
        Custom limits to be checked. Must be numeric of length 2.
        Elements can be None to fall back to `x_min` or `x_max`.
    x_min : float
        Minimum value of the data, used when the lower limit is None.
    x_max : float
        Maximum value of the data, used when the upper limit is None.

    Returns
    -------
    custom_lims : list
        The validated limits as a list of two numbers.
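
    Examples
    --------
    A minimal, illustrative sketch with made-up data bounds; `None` entries fall back to
    the data bounds.

    >>> lims = _check_custom_lims((-1, None), x_min=0.2, x_max=3.5)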
    """
    if not isinstance(custom_lims, (list, tuple)):
        raise TypeError(
            "`custom_lims` must be a numeric list or tuple of length 2.\n"
            f"Not an object of {type(custom_lims)}."
        )

    if len(custom_lims) != 2:
        raise AttributeError(f"`len(custom_lims)` must be 2, not {len(custom_lims)}.")

    any_bool = any(isinstance(i, bool) for i in custom_lims)
    if any_bool:
        raise TypeError("Elements of `custom_lims` must be numeric or None, not bool.")

    custom_lims = list(custom_lims)  # convert to a mutable object
    if custom_lims[0] is None:
        custom_lims[0] = x_min

    if custom_lims[1] is None:
        custom_lims[1] = x_max

    all_numeric = all(isinstance(i, (int, float, np.integer, np.floating)) for i in custom_lims)
    if not all_numeric:
        raise TypeError(
            "Elements of `custom_lims` must be numeric or None.\nAt least one of them is not."
        )

    if not custom_lims[0] < custom_lims[1]:
        raise ValueError("`custom_lims[0]` must be smaller than `custom_lims[1]`.")

    if custom_lims[0] > x_min or custom_lims[1] < x_max:
        raise ValueError("Some observations are outside `custom_lims` boundaries.")

    return custom_lims


def _get_grid(
    x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False
):
    """Compute the grid that bins the data used to estimate the density function.

    Parameters
    ----------
    x_min : float
        Minimum value of the data.
    x_max: float
        Maximum value of the data.
    x_std: float
        Standard deviation of the data.
    extend_fct: float
        Indicates the factor by which `x_std` is multiplied
        to extend the range of the data.
    grid_len: int
        Number of bins.
    custom_lims: tuple or list
        Custom limits for the domain of the density estimation.
        Must be numeric of length 2. Overrides `extend`.
    extend: bool, optional
        Whether to extend the range of the data or not.
        Default is True.
    bound_correction: bool, optional
        Whether the density estimation performs boundary correction or not.
        This does not affect the grid values directly; it is only used to override `extend`.
        Default is False.

    Returns
    -------
    grid_min: float
        Minimum value of the grid.
    grid_max: float
        Maximum value of the grid.
    grid_len: int
        Number of bins.
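
    Examples
    --------
    A minimal, illustrative sketch using made-up summary statistics for the data.

    >>> grid_min, grid_max, grid_len = _get_grid(
    ...     x_min=0.0, x_max=10.0, x_std=2.0, extend_fct=0.5, grid_len=512,
    ...     custom_lims=None, extend=True, bound_correction=False
    ... )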
    """
    # Set up number of bins.
    grid_len = max(int(grid_len), 100)

    # Set up domain
    if custom_lims is not None:
        custom_lims = _check_custom_lims(custom_lims, x_min, x_max)
        grid_min = custom_lims[0]
        grid_max = custom_lims[1]
    elif extend and not bound_correction:
        grid_extend = extend_fct * x_std
        grid_min = x_min - grid_extend
        grid_max = x_max + grid_extend
    else:
        grid_min = x_min
        grid_max = x_max
    return grid_min, grid_max, grid_len


def kde(x, circular=False, **kwargs):
    """One dimensional density estimation.

    It is a wrapper around `_kde_linear()` and `_kde_circular()`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
    circular: bool, optional
        Whether `x` is a circular variable or not. Defaults to False.
        If the string "degrees" is passed, `x` is assumed to be circular and measured in
        degrees, and it is converted to radians before the estimation.
    **kwargs: Arguments passed to `_kde_linear()` and `_kde_circular()`.
        See their documentation for more info.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.

    Examples
    --------
    Default density estimation for linear data

    .. plot::
        :context: close-figs

        >>> import numpy as np
        >>> import matplotlib.pyplot as plt
        >>> from arviz import kde
        >>>
        >>> rvs = np.random.gamma(shape=1.8, size=1000)
        >>> grid, pdf = kde(rvs)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with Silverman's rule bandwidth

    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bw="silverman")
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for linear data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> # bw_fct > 1 means more smoothness.
        >>> grid, pdf = kde(rvs, bw_fct=2.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with extended limits

    .. plot::
        :context: close-figs

        >>> grid, pdf = kde(rvs, bound_correction=False, extend=True, extend_fct=0.5)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for linear data with custom limits

    .. plot::
        :context: close-figs

        >>> # It accepts tuples and lists of length 2.
        >>> grid, pdf = kde(rvs, bound_correction=False, custom_lims=(0, 10))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Default density estimation for circular data

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> grid, pdf = kde(rvs, circular=True)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with scaled bandwidth

    .. plot::
        :context: close-figs

        >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500)
        >>> # bw_fct > 1 means less smoothness.
        >>> grid, pdf = kde(rvs, circular=True, bw_fct=3)
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    Density estimation for circular data with custom limits

    .. plot::
        :context: close-figs

        >>> # This is still experimental, does not always work.
        >>> rvs = np.random.vonmises(mu=0, kappa=30, size=500)
        >>> grid, pdf = kde(rvs, circular=True, custom_lims=(-1, 1))
        >>> plt.plot(grid, pdf)
        >>> plt.show()

    See Also
    --------
    plot_kde : Compute and plot a kernel density estimate.
    """
    x = x[np.isfinite(x)]
    if x.size == 0 or np.all(x == x[0]):
        warnings.warn("Your data appears to have a single value or no finite values")

        return np.zeros(2), np.array([np.nan] * 2)

    if circular:
        if circular == "degrees":
            x = np.radians(x)
        kde_fun = _kde_circular
    else:
        kde_fun = _kde_linear

    return kde_fun(x, **kwargs)


def _kde_linear(
    x,
    bw="experimental",
    adaptive=False,
    extend=False,
    bound_correction=True,
    extend_fct=0,
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for linear data.

    Given an array of data points `x` it returns an estimate of
    the probability density function that generated the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be one of "scott",
        "silverman", "isj" or "experimental". Defaults to "experimental".
    adaptive: boolean, optional
        Indicates if the bandwidth is adaptive or not.
        It is the recommended approach when there are multiple modes with different spread.
        It is not compatible with convolution. Defaults to False.
    extend: boolean, optional
        Whether to extend the observed range for `x` in the estimation.
        It extends each bound by a multiple of the standard deviation of `x` given by `extend_fct`.
        Defaults to False.
    bound_correction: boolean, optional
        Whether to perform boundary correction on the bounds of `x` or not.
        Defaults to True.
    extend_fct: float, optional
        Number of standard deviations used to widen the lower and upper bounds of `x`.
        Defaults to 0.
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand.
        Must be positive. Values below 1 decrease smoothness while values above 1 increase it.
        Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the other objects.
        Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds for the range of `x`.
        Defaults to None which disables custom bounds.
    cumulative: bool, optional
        Whether to return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points i.e. the length of the grid used in
        the estimation. Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.
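
    Examples
    --------
    A minimal, illustrative sketch with simulated gamma-distributed data, also requesting
    the estimated bandwidth.

    >>> import numpy as np
    >>> x = np.random.gamma(shape=2.0, size=1000)
    >>> grid, pdf, bw = _kde_linear(x, bw="silverman", bw_return=True)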
    """
    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(f"`bw_fct` must be a positive number, not an object of {type(bw_fct)}.")

    if bw_fct <= 0:
        raise ValueError(f"`bw_fct` must be a positive number, not {bw_fct}.")

    # Preliminary calculations
    x_min = x.min()
    x_max = x.max()
    x_std = np.std(x)
    x_range = x_max - x_min

    # Determine grid
    grid_min, grid_max, grid_len = _get_grid(
        x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction
    )
    grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max))

    # Bandwidth estimation
    bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range)

    # Density estimation
    if adaptive:
        grid, pdf = _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction)
    else:
        grid, pdf = _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf


def _kde_circular(
    x,
    bw="taylor",
    bw_fct=1,
    bw_return=False,
    custom_lims=None,
    cumulative=False,
    grid_len=512,
    **kwargs,  # pylint: disable=unused-argument
):
    """One dimensional density estimation for circular data.

    Given an array of data points `x` measured in radians, it returns an estimate of the
    probability density function that generated the samples in `x`.

    Parameters
    ----------
    x : 1D numpy array
        Data used to calculate the density estimation.
    bw: int, float or str, optional
        If numeric, indicates the bandwidth and must be positive.
        If str, indicates the method to estimate the bandwidth and must be "taylor" since it is the
        only option supported so far. Defaults to "taylor".
    bw_fct: float, optional
        A value that multiplies `bw` which enables tuning smoothness by hand. Must be positive.
        Values above 1 decrease smoothness while values below 1 increase it.
        Defaults to 1 (no modification).
    bw_return: bool, optional
        Whether to return the estimated bandwidth in addition to the other objects.
        Defaults to False.
    custom_lims: list or tuple, optional
        A list or tuple of length 2 indicating custom bounds for the range of `x`.
        Defaults to None which means the estimation limits are [-pi, pi].
    cumulative: bool, optional
        Whether to return the PDF or the cumulative PDF. Defaults to False.
    grid_len: int, optional
        The number of intervals used to bin the data points i.e. the length of the grid used in the
        estimation. Defaults to 512.

    Returns
    -------
    grid : Gridded numpy array for the x values.
    pdf : Numpy array for the density estimates.
    bw: optional, the estimated bandwidth.
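
    Examples
    --------
    A minimal, illustrative sketch with simulated von Mises data measured in radians.

    >>> import numpy as np
    >>> x = np.random.vonmises(mu=0.0, kappa=2.0, size=500)
    >>> grid, pdf = _kde_circular(x, bw="taylor")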
    """
    # All values between -pi and pi
    x = _normalize_angle(x)

    # Check `bw_fct` is numeric and positive
    if not isinstance(bw_fct, (int, float, np.integer, np.floating)):
        raise TypeError(f"`bw_fct` must be a positive number, not an object of {type(bw_fct)}.")

    if bw_fct <= 0:
        raise ValueError(f"`bw_fct` must be a positive number, not {bw_fct}.")

    # Determine bandwidth
    if isinstance(bw, bool):
        raise ValueError(
            "`bw` can't be of type `bool`.\nExpected a positive numeric or 'taylor'."
        )
    if isinstance(bw, (int, float)):
        if bw < 0:
            raise ValueError(f"Numeric `bw` must be positive.\nInput: {bw:.4f}.")
    if isinstance(bw, str):
        if bw == "taylor":
            bw = _bw_taylor(x)
        else:
            raise ValueError(f"`bw` must be a positive numeric or `taylor`, not {bw}")
    bw *= bw_fct

    # Determine grid
    if custom_lims is not None:
        custom_lims = _check_custom_lims(custom_lims, x.min(), x.max())
        grid_min = custom_lims[0]
        grid_max = custom_lims[1]
        assert grid_min >= -np.pi, "Lower limit can't be smaller than -pi"
        assert grid_max <= np.pi, "Upper limit can't be larger than pi"
    else:
        grid_min = -np.pi
        grid_max = np.pi

    bins = np.linspace(grid_min, grid_max, grid_len + 1)
    bin_counts, _, bin_edges = histogram(x, bins=bins)
    grid = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    kern = _vonmises_pdf(x=grid, mu=0, kappa=bw)
    pdf = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kern) * np.fft.rfft(bin_counts)))
    pdf /= len(x)

    if cumulative:
        pdf = pdf.cumsum() / pdf.sum()

    if bw_return:
        return grid, pdf, bw
    else:
        return grid, pdf


# pylint: disable=unused-argument
def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
    """Kernel density with convolution.

    One dimensional Gaussian kernel density estimation via convolution of the binned relative
    frequencies and a Gaussian filter. This is an internal function used by `kde()`.
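
    Examples
    --------
    A minimal, illustrative sketch; the binned counts and edges come from `histogram` and
    the bandwidth value is arbitrary.

    >>> import numpy as np
    >>> x = np.random.normal(size=1000)
    >>> grid_len = 256
    >>> grid_counts, _, grid_edges = histogram(x, grid_len, (x.min(), x.max()))
    >>> grid, pdf = _kde_convolution(
    ...     x, 0.3, grid_edges, grid_counts, grid_len, bound_correction=False
    ... )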
    """
    # Calculate relative frequencies per bin
    bin_width = grid_edges[1] - grid_edges[0]
    f = grid_counts / bin_width / len(x)

    # Bandwidth must consider the bin width
    bw /= bin_width

    # See: https://stackoverflow.com/questions/2773606/gaussian-filter-in-matlab

    grid = (grid_edges[1:] + grid_edges[:-1]) / 2

    kernel_n = int(bw * 2 * np.pi)
    if kernel_n == 0:
        kernel_n = 1

    kernel = gaussian(kernel_n, bw)

    if bound_correction:
        npad = int(grid_len / 5)
        f = np.concatenate([f[npad - 1 :: -1], f, f[grid_len : grid_len - npad - 1 : -1]])
        pdf = convolve(f, kernel, mode="same", method="direct")[npad : npad + grid_len]
        pdf /= bw * (2 * np.pi) ** 0.5
    else:
        pdf = convolve(f, kernel, mode="same", method="direct")
        pdf /= bw * (2 * np.pi) ** 0.5

    return grid, pdf


def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs):
    """Compute Adaptive Kernel Density Estimation.

    One dimensional adaptive Gaussian kernel density estimation. The implementation uses the
    binning technique. Since there is not a unique `bw`, the convolution is not possible. The
    alternative implemented in this function is known as Abramson's method.
    This is an internal function used by `kde()`.
    """
    # Pilot computations used for bandwidth adjustment
    pilot_grid, pilot_pdf = _kde_convolution(
        x, bw, grid_edges, grid_counts, grid_len, bound_correction
    )

    # Adds to avoid np.log(0) and zero division
    pilot_pdf += 1e-9

    # Determine the modification factors
    pdf_interp = np.interp(x, pilot_grid, pilot_pdf)
    geom_mean = np.exp(np.mean(np.log(pdf_interp)))

    # Power of c = 0.5 -> Abramson's method
    adj_factor = (geom_mean / pilot_pdf) ** 0.5
    bw_adj = bw * adj_factor

    # Estimation of Gaussian KDE via binned method (convolution not possible)
    grid = pilot_grid

    if bound_correction:
        grid_npad = int(grid_len / 5)
        grid_width = grid_edges[1] - grid_edges[0]
        grid_pad = grid_npad * grid_width
        grid_padded = np.linspace(
            grid_edges[0] - grid_pad,
            grid_edges[grid_len - 1] + grid_pad,
            num=grid_len + 2 * grid_npad,
        )
        grid_counts = np.concatenate(
            [
                grid_counts[grid_npad - 1 :: -1],
                grid_counts,
                grid_counts[grid_len : grid_len - grid_npad - 1 : -1],
            ]
        )
        bw_adj = np.concatenate(
            [bw_adj[grid_npad - 1 :: -1], bw_adj, bw_adj[grid_len : grid_len - grid_npad - 1 : -1]]
        )
        pdf_mat = (grid_padded - grid_padded[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat[:, grid_npad : grid_npad + grid_len], axis=0) / len(x)

    else:
        pdf_mat = (grid - grid[:, None]) / bw_adj[:, None]
        pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None]
        pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None]
        pdf = np.sum(pdf_mat, axis=0) / len(x)

    return grid, pdf


def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False):
    """
    2D fft-based Gaussian kernel density estimate (KDE).

    The code was adapted from https://github.com/mfouesneau/faststats

    Parameters
    ----------
    x : Numpy array or list
    y : Numpy array or list
    gridsize : tuple
        Number of points used to discretize data. Use powers of 2 for fft optimization
    circular: bool
        If True use circular boundaries. Defaults to False

    Returns
    -------
    grid: A gridded 2D KDE of the input points (x, y)
    xmin: minimum value of x
    xmax: maximum value of x
    ymin: minimum value of y
    ymax: maximum value of y
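
    Examples
    --------
    A minimal, illustrative sketch with simulated correlated data.

    >>> import numpy as np
    >>> x = np.random.normal(size=1000)
    >>> y = 0.5 * x + np.random.normal(size=1000)
    >>> grid, xmin, xmax, ymin, ymax = _fast_kde_2d(x, y, gridsize=(128, 128))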
    """
    x = np.asarray(x, dtype=float)
    x = x[np.isfinite(x)]
    y = np.asarray(y, dtype=float)
    y = y[np.isfinite(y)]

    xmin, xmax = x.min(), x.max()
    ymin, ymax = y.min(), y.max()

    len_x = len(x)
    weights = np.ones(len_x)
    n_x, n_y = gridsize

    d_x = (xmax - xmin) / (n_x - 1)
    d_y = (ymax - ymin) / (n_y - 1)

    xyi = _stack(x, y).T
    xyi -= [xmin, ymin]
    xyi /= [d_x, d_y]
    xyi = np.floor(xyi, xyi).T

    scotts_factor = len_x ** (-1 / 6)
    cov = _cov(xyi)
    std_devs = np.diag(cov) ** 0.5
    kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs)

    inv_cov = np.linalg.inv(cov * scotts_factor ** 2)

    x_x = np.arange(kern_nx) - kern_nx / 2
    y_y = np.arange(kern_ny) - kern_ny / 2
    x_x, y_y = np.meshgrid(x_x, y_y)

    kernel = _stack(x_x.flatten(), y_y.flatten())
    kernel = _dot(inv_cov, kernel) * kernel
    kernel = np.exp(-kernel.sum(axis=0) / 2)
    kernel = kernel.reshape((int(kern_ny), int(kern_nx)))

    boundary = "wrap" if circular else "symm"

    grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray()
    grid = convolve2d(grid, kernel, mode="same", boundary=boundary)

    norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2)
    norm_factor = len_x * d_x * d_y * norm_factor ** 0.5

    grid /= norm_factor

    return grid, xmin, xmax, ymin, ymax


def get_bins(values):
    """
    Automatically compute the number of bins for discrete variables.

    Parameters
    ----------
    values : numpy array
        Array of values for which the bins are computed.

    Returns
    -------
    array with the bins

    Notes
    -----
    Computes the width of the bins by taking the maximum of the Sturges and the Freedman-Diaconis
    estimators. According to numpy's `np.histogram` documentation this provides good all around
    performance.

    The Sturges estimator is a very simplistic one based on the assumption of normality of the
    data. This estimator has poor performance for non-normal data, which becomes especially
    obvious for large data sets. The estimate depends only on the size of the data.

    The Freedman-Diaconis rule uses the interquartile range (IQR) to estimate the binwidth.
    It is considered a robust version of the Scott rule as the IQR is less affected by outliers
    than the standard deviation. However, the IQR depends on fewer points than the standard
    deviation, so it is less accurate, especially for long tailed distributions.
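
    Examples
    --------
    A minimal, illustrative sketch with simulated count data.

    >>> import numpy as np
    >>> counts = np.random.poisson(lam=4, size=500)
    >>> bins = get_bins(counts)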
    """
    dtype = values.dtype.kind

    if dtype == "i":
        x_min = values.min().astype(int)
        x_max = values.max().astype(int)
    else:
        x_min = values.min().astype(float)
        x_max = values.max().astype(float)

    # Sturges histogram bin estimator
    bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1)

    # The Freedman-Diaconis histogram bin estimator.
    iqr = np.subtract(*np.percentile(values, [75, 25]))  # pylint: disable=assignment-from-no-return
    bins_fd = 2 * iqr * values.size ** (-1 / 3)

    if dtype == "i":
        width = np.round(np.max([1, bins_sturges, bins_fd])).astype(int)
        bins = np.arange(x_min, x_max + width + 1, width)
    else:
        width = np.max([bins_sturges, bins_fd])
        if np.isclose(x_min, x_max):
            width = 1e-3
        bins = np.arange(x_min, x_max + width, width)

    return bins


def _sturges_formula(dataset, mult=1):
    """Use Sturges' formula to determine number of bins.

    See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula
    or https://doi.org/10.1080%2F01621459.1926.10502161

    Parameters
    ----------
    dataset: xarray.Dataset
        Must have the `draw` dimension

    mult: float
        Used to scale the number of bins up or down. Default is 1 for Sturges' formula.

    Returns
    -------
    int
        Number of bins to use
    """
    return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1)


def _circular_mean(x):
    """Compute mean of circular variable measured in radians.

    The result is between -pi and pi.
    """
    sinr = np.sum(np.sin(x))
    cosr = np.sum(np.cos(x))
    mean = np.arctan2(sinr, cosr)

    return mean


def _normalize_angle(x, zero_centered=True):
    """Normalize angles.

    Normalize angles in radians to [-pi, pi) or [0, 2 * pi) according to `zero_centered`.
    """
    if zero_centered:
        return (x + np.pi) % (2 * np.pi) - np.pi
    else:
        return x % (2 * np.pi)


@conditional_jit(cache=True)
def histogram(data, bins, range_hist=None):
    """Conditionally jitted histogram.

    Parameters
    ----------
    data : array-like
        Input data. Passed as first positional argument to ``np.histogram``.
    bins : int or array-like
        Passed as keyword argument ``bins`` to ``np.histogram``.
    range_hist : (float, float), optional
        Passed as keyword argument ``range`` to ``np.histogram``.

    Returns
    -------
    hist : array
        The number of counts per bin.
    density : array
        The density corresponding to each bin.
    bin_edges : array
        The edges of the bins used.
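
    Examples
    --------
    A minimal, illustrative sketch with simulated data and an explicit range.

    >>> import numpy as np
    >>> data = np.random.normal(size=1000)
    >>> hist, dens, edges = histogram(data, 100, (-4.0, 4.0))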
    """
    hist, bin_edges = np.histogram(data, bins=bins, range=range_hist)
    hist_dens = hist / (hist.sum() * np.diff(bin_edges))
    return hist, hist_dens, bin_edges


def _find_hdi_contours(density, hdi_probs):
    """
    Find contours enclosing regions of highest posterior density.

    Parameters
    ----------
    density : array-like
        A 2D KDE on a grid with cells of equal area.
    hdi_probs : array-like
        An array of highest density interval confidence probabilities.

    Returns
    -------
    contour_levels : array
        The contour levels corresponding to the given HDI probabilities.
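
    Examples
    --------
    A minimal, illustrative sketch; the density grid comes from `_fast_kde_2d` and the
    probabilities are arbitrary.

    >>> import numpy as np
    >>> x = np.random.normal(size=1000)
    >>> y = np.random.normal(size=1000)
    >>> density = _fast_kde_2d(x, y)[0]
    >>> levels = _find_hdi_contours(density, np.array([0.5, 0.9]))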
    """
    # Using the algorithm from corner.py
    sorted_density = np.sort(density, axis=None)[::-1]
    sm = sorted_density.cumsum()
    sm /= sm[-1]

    contours = np.empty_like(hdi_probs)
    for idx, hdi_prob in enumerate(hdi_probs):
        try:
            contours[idx] = sorted_density[sm <= hdi_prob][-1]
        except IndexError:
            contours[idx] = sorted_density[0]

    return contours