1# pylint: disable=invalid-name,too-many-lines 2"""Density estimation functions for ArviZ.""" 3import warnings 4 5import numpy as np 6from scipy.fftpack import fft 7from scipy.optimize import brentq 8from scipy.signal import convolve, convolve2d, gaussian # pylint: disable=no-name-in-module 9from scipy.sparse import coo_matrix 10from scipy.special import ive # pylint: disable=no-name-in-module 11 12from ..utils import _cov, _dot, _stack, conditional_jit 13 14__all__ = ["kde"] 15 16 17def _bw_scott(x, x_std=None, **kwargs): # pylint: disable=unused-argument 18 """Scott's Rule.""" 19 if x_std is None: 20 x_std = np.std(x) 21 bw = 1.06 * x_std * len(x) ** (-0.2) 22 return bw 23 24 25def _bw_silverman(x, x_std=None, **kwargs): # pylint: disable=unused-argument 26 """Silverman's Rule.""" 27 if x_std is None: 28 x_std = np.std(x) 29 q75, q25 = np.percentile(x, [75, 25]) 30 x_iqr = q75 - q25 31 a = min(x_std, x_iqr / 1.34) 32 bw = 0.9 * a * len(x) ** (-0.2) 33 return bw 34 35 36def _bw_isj(x, grid_counts=None, x_std=None, x_range=None): 37 """Improved Sheather-Jones bandwidth estimation. 38 39 Improved Sheather and Jones method as explained in [1]_. This method is used internally by the 40 KDE estimator, resulting in saved computation time as minimums, maximums and the grid are 41 pre-computed. 42 43 References 44 ---------- 45 .. [1] Kernel density estimation via diffusion. 46 Z. I. Botev, J. F. Grotowski, and D. P. Kroese. 47 Ann. Statist. 38 (2010), no. 5, 2916--2957. 48 """ 49 x_len = len(x) 50 if x_range is None: 51 x_min = np.min(x) 52 x_max = np.max(x) 53 x_range = x_max - x_min 54 55 # Relative frequency per bin 56 if grid_counts is None: 57 x_std = np.std(x) 58 grid_len = 256 59 grid_min = x_min - 0.5 * x_std 60 grid_max = x_max + 0.5 * x_std 61 grid_counts, _, _ = histogram(x, grid_len, (grid_min, grid_max)) 62 else: 63 grid_len = len(grid_counts) - 1 64 65 grid_relfreq = grid_counts / x_len 66 67 # Discrete cosine transform of the data 68 a_k = _dct1d(grid_relfreq) 69 70 k_sq = np.arange(1, grid_len) ** 2 71 a_sq = a_k[range(1, grid_len)] ** 2 72 73 t = _root(_fixed_point, x_len, args=(x_len, k_sq, a_sq), x=x) 74 h = t ** 0.5 * x_range 75 return h 76 77 78def _bw_experimental(x, grid_counts=None, x_std=None, x_range=None): 79 """Experimental bandwidth estimator.""" 80 bw_silverman = _bw_silverman(x, x_std=x_std) 81 bw_isj = _bw_isj(x, grid_counts=grid_counts, x_range=x_range) 82 return 0.5 * (bw_silverman + bw_isj) 83 84 85def _bw_taylor(x): 86 """Taylor's rule for circular bandwidth estimation. 87 88 This function implements a rule-of-thumb for choosing the bandwidth of a von Mises kernel 89 density estimator that assumes the underlying distribution is von Mises as introduced in [1]_. 90 It is analogous to Scott's rule for the Gaussian KDE. 91 92 Circular bandwidth has a different scale from linear bandwidth. Unlike linear scale, low 93 bandwidths are associated with oversmoothing and high values with undersmoothing. 94 95 References 96 ---------- 97 .. [1] C.C Taylor (2008). Automatic bandwidth selection for circular 98 density estimation. 99 Computational Statistics and Data Analysis, 52, 7, 3493–3500. 100 """ 101 x_len = len(x) 102 kappa = _kappa_mle(x) 103 num = 3 * x_len * kappa ** 2 * ive(2, 2 * kappa) 104 den = 4 * np.pi ** 0.5 * ive(0, kappa) ** 2 105 return (num / den) ** 0.4 106 107 108_BW_METHODS_LINEAR = { 109 "scott": _bw_scott, 110 "silverman": _bw_silverman, 111 "isj": _bw_isj, 112 "experimental": _bw_experimental, 113} 114 115 116def _get_bw(x, bw, grid_counts=None, x_std=None, x_range=None): 117 """Compute bandwidth for a given data `x` and `bw`. 118 119 Also checks `bw` is correctly specified. 120 121 Parameters 122 ---------- 123 x : 1-D numpy array 124 1 dimensional array of sample data from the 125 variable for which a density estimate is desired. 126 bw: int, float or str 127 If numeric, indicates the bandwidth and must be positive. 128 If str, indicates the method to estimate the bandwidth. 129 130 Returns 131 ------- 132 bw: float 133 Bandwidth 134 """ 135 if isinstance(bw, bool): 136 raise ValueError( 137 ( 138 "`bw` must not be of type `bool`.\n" 139 "Expected a positive numeric or one of the following strings:\n" 140 f"{list(_BW_METHODS_LINEAR.keys())}." 141 ) 142 ) 143 if isinstance(bw, (int, float)): 144 if bw < 0: 145 raise ValueError(f"Numeric `bw` must be positive.\nInput: {bw:.4f}.") 146 elif isinstance(bw, str): 147 bw_lower = bw.lower() 148 149 if bw_lower not in _BW_METHODS_LINEAR.keys(): 150 raise ValueError( 151 "Unrecognized bandwidth method.\n" 152 f"Input is: {bw_lower}.\n" 153 f"Expected one of: {list(_BW_METHODS_LINEAR.keys())}." 154 ) 155 156 bw_fun = _BW_METHODS_LINEAR[bw_lower] 157 bw = bw_fun(x, grid_counts=grid_counts, x_std=x_std, x_range=x_range) 158 else: 159 raise ValueError( 160 "Unrecognized `bw` argument.\n" 161 "Expected a positive numeric or one of the following strings:\n" 162 f"{list(_BW_METHODS_LINEAR.keys())}." 163 ) 164 return bw 165 166 167def _vonmises_pdf(x, mu, kappa): 168 """Calculate vonmises_pdf.""" 169 if kappa <= 0: 170 raise ValueError("Argument 'kappa' must be positive.") 171 pdf = 1 / (2 * np.pi * ive(0, kappa)) * np.exp(np.cos(x - mu) - 1) ** kappa 172 return pdf 173 174 175def _a1inv(x): 176 """Compute inverse function. 177 178 Inverse function of the ratio of the first and 179 zeroth order Bessel functions of the first kind. 180 181 Returns the value k, such that a1inv(x) = k, i.e. a1(k) = x. 182 """ 183 if 0 <= x < 0.53: 184 return 2 * x + x ** 3 + (5 * x ** 5) / 6 185 elif x < 0.85: 186 return -0.4 + 1.39 * x + 0.43 / (1 - x) 187 else: 188 return 1 / (x ** 3 - 4 * x ** 2 + 3 * x) 189 190 191def _kappa_mle(x): 192 mean = _circular_mean(x) 193 kappa = _a1inv(np.mean(np.cos(x - mean))) 194 return kappa 195 196 197def _dct1d(x): 198 """Discrete Cosine Transform in 1 Dimension. 199 200 Parameters 201 ---------- 202 x : numpy array 203 1 dimensional array of values for which the 204 DCT is desired 205 206 Returns 207 ------- 208 output : DTC transformed values 209 """ 210 x_len = len(x) 211 212 even_increasing = np.arange(0, x_len, 2) 213 odd_decreasing = np.arange(x_len - 1, 0, -2) 214 215 x = np.concatenate((x[even_increasing], x[odd_decreasing])) 216 217 w_1k = np.r_[1, (2 * np.exp(-(0 + 1j) * (np.arange(1, x_len)) * np.pi / (2 * x_len)))] 218 output = np.real(w_1k * fft(x)) 219 220 return output 221 222 223def _fixed_point(t, N, k_sq, a_sq): 224 """Calculate t-zeta*gamma^[l](t). 225 226 Implementation of the function t-zeta*gamma^[l](t) derived from equation (30) in [1]. 227 228 References 229 ---------- 230 .. [1] Kernel density estimation via diffusion. 231 Z. I. Botev, J. F. Grotowski, and D. P. Kroese. 232 Ann. Statist. 38 (2010), no. 5, 2916--2957. 233 """ 234 k_sq = np.asfarray(k_sq, dtype=np.float64) 235 a_sq = np.asfarray(a_sq, dtype=np.float64) 236 237 l = 7 238 f = np.sum(np.power(k_sq, l) * a_sq * np.exp(-k_sq * np.pi ** 2 * t)) 239 f *= 0.5 * np.pi ** (2.0 * l) 240 241 for j in np.arange(l - 1, 2 - 1, -1): 242 c1 = (1 + 0.5 ** (j + 0.5)) / 3 243 c2 = np.product(np.arange(1.0, 2 * j + 1, 2, dtype=np.float64)) 244 c2 /= (np.pi / 2) ** 0.5 245 t_j = np.power((c1 * (c2 / (N * f))), (2.0 / (3.0 + 2.0 * j))) 246 f = np.sum(k_sq ** j * a_sq * np.exp(-k_sq * np.pi ** 2.0 * t_j)) 247 f *= 0.5 * np.pi ** (2 * j) 248 249 out = t - (2 * N * np.pi ** 0.5 * f) ** (-0.4) 250 return out 251 252 253def _root(function, N, args, x): 254 # The right bound is at most 0.01 255 found = False 256 N = max(min(1050, N), 50) 257 tol = 10e-12 + 0.01 * (N - 50) / 1000 258 259 while not found: 260 try: 261 bw, res = brentq(function, 0, 0.01, args=args, full_output=True, disp=False) 262 found = res.converged 263 except ValueError: 264 bw = 0 265 tol *= 2.0 266 found = False 267 if bw <= 0 or tol >= 1: 268 bw = (_bw_silverman(x) / np.ptp(x)) ** 2 269 return bw 270 return bw 271 272 273def _check_custom_lims(custom_lims, x_min, x_max): 274 """Check if `custom_lims` are of the correct type. 275 276 It accepts numeric lists/tuples of length 2. 277 278 Parameters 279 ---------- 280 custom_lims : Object whose type is checked. 281 282 Returns 283 ------- 284 None: Object of type None 285 """ 286 if not isinstance(custom_lims, (list, tuple)): 287 raise TypeError( 288 "`custom_lims` must be a numeric list or tuple of length 2.\n" 289 f"Not an object of {type(custom_lims)}." 290 ) 291 292 if len(custom_lims) != 2: 293 raise AttributeError(f"`len(custom_lims)` must be 2, not {len(custom_lims)}.") 294 295 any_bool = any(isinstance(i, bool) for i in custom_lims) 296 if any_bool: 297 raise TypeError("Elements of `custom_lims` must be numeric or None, not bool.") 298 299 custom_lims = list(custom_lims) # convert to a mutable object 300 if custom_lims[0] is None: 301 custom_lims[0] = x_min 302 303 if custom_lims[1] is None: 304 custom_lims[1] = x_max 305 306 all_numeric = all(isinstance(i, (int, float, np.integer, np.float)) for i in custom_lims) 307 if not all_numeric: 308 raise TypeError( 309 ("Elements of `custom_lims` must be numeric or None.\n" "At least one of them is not.") 310 ) 311 312 if not custom_lims[0] < custom_lims[1]: 313 raise ValueError("`custom_lims[0]` must be smaller than `custom_lims[1]`.") 314 315 if custom_lims[0] > x_min or custom_lims[1] < x_max: 316 raise ValueError("Some observations are outside `custom_lims` boundaries.") 317 318 return custom_lims 319 320 321def _get_grid( 322 x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend=True, bound_correction=False 323): 324 """Compute the grid that bins the data used to estimate the density function. 325 326 Parameters 327 ---------- 328 x_min : float 329 Minimum value of the data 330 x_max: float 331 Maximum value of the data. 332 x_std: float 333 Standard deviation of the data. 334 extend_fct: bool 335 Indicates the factor by which `x_std` is multiplied 336 to extend the range of the data. 337 grid_len: int 338 Number of bins 339 custom_lims: tuple or list 340 Custom limits for the domain of the density estimation. 341 Must be numeric of length 2. Overrides `extend`. 342 extend: bool, optional 343 Whether to extend the range of the data or not. 344 Default is True. 345 bound_correction: bool, optional 346 Whether the density estimations performs boundary correction or not. 347 This does not impacts directly in the output, but is used 348 to override `extend`. Overrides `extend`. 349 Default is False. 350 351 Returns 352 ------- 353 grid_len: int 354 Number of bins 355 grid_min: float 356 Minimum value of the grid 357 grid_max: float 358 Maximum value of the grid 359 """ 360 # Set up number of bins. 361 grid_len = max(int(grid_len), 100) 362 363 # Set up domain 364 if custom_lims is not None: 365 custom_lims = _check_custom_lims(custom_lims, x_min, x_max) 366 grid_min = custom_lims[0] 367 grid_max = custom_lims[1] 368 elif extend and not bound_correction: 369 grid_extend = extend_fct * x_std 370 grid_min = x_min - grid_extend 371 grid_max = x_max + grid_extend 372 else: 373 grid_min = x_min 374 grid_max = x_max 375 return grid_min, grid_max, grid_len 376 377 378def kde(x, circular=False, **kwargs): 379 """One dimensional density estimation. 380 381 It is a wrapper around `kde_linear()` and `kde_circular()`. 382 383 Parameters 384 ---------- 385 x : 1D numpy array 386 Data used to calculate the density estimation. 387 circular: bool, optional 388 Whether `x` is a circular variable or not. Defaults to False. 389 **kwargs: Arguments passed to `kde_linear()` and `kde_circular()`. 390 See their documentation for more info. 391 392 Returns 393 ------- 394 grid : Gridded numpy array for the x values. 395 pdf : Numpy array for the density estimates. 396 bw: optional, the estimated bandwidth. 397 398 Examples 399 -------- 400 Default density estimation for linear data 401 .. plot:: 402 :context: close-figs 403 404 >>> import numpy as np 405 >>> import matplotlib.pyplot as plt 406 >>> from arviz import kde 407 >>> 408 >>> rvs = np.random.gamma(shape=1.8, size=1000) 409 >>> grid, pdf = kde(rvs) 410 >>> plt.plot(grid, pdf) 411 >>> plt.show() 412 413 Density estimation for linear data with Silverman's rule bandwidth 414 .. plot:: 415 :context: close-figs 416 417 >>> grid, pdf = kde(rvs, bw="silverman") 418 >>> plt.plot(grid, pdf) 419 >>> plt.show() 420 421 Density estimation for linear data with scaled bandwidth 422 .. plot:: 423 :context: close-figs 424 425 >>> # bw_fct > 1 means more smoothness. 426 >>> grid, pdf = kde(rvs, bw_fct=2.5) 427 >>> plt.plot(grid, pdf) 428 >>> plt.show() 429 430 Default density estimation for linear data with extended limits 431 .. plot:: 432 :context: close-figs 433 434 >>> grid, pdf = kde(rvs, bound_correction=False, extend=True, extend_fct=0.5) 435 >>> plt.plot(grid, pdf) 436 >>> plt.show() 437 438 Default density estimation for linear data with custom limits 439 .. plot:: 440 :context: close-figs 441 # It accepts tuples and lists of length 2. 442 >>> grid, pdf = kde(rvs, bound_correction=False, custom_lims=(0, 10)) 443 >>> plt.plot(grid, pdf) 444 >>> plt.show() 445 446 Default density estimation for circular data 447 .. plot:: 448 :context: close-figs 449 450 >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500) 451 >>> grid, pdf = kde(rvs, circular=True) 452 >>> plt.plot(grid, pdf) 453 >>> plt.show() 454 455 Density estimation for circular data with scaled bandwidth 456 .. plot:: 457 :context: close-figs 458 459 >>> rvs = np.random.vonmises(mu=np.pi, kappa=1, size=500) 460 >>> # bw_fct > 1 means less smoothness. 461 >>> grid, pdf = kde(rvs, circular=True, bw_fct=3) 462 >>> plt.plot(grid, pdf) 463 >>> plt.show() 464 465 Density estimation for circular data with custom limits 466 .. plot:: 467 :context: close-figs 468 469 >>> # This is still experimental, does not always work. 470 >>> rvs = np.random.vonmises(mu=0, kappa=30, size=500) 471 >>> grid, pdf = kde(rvs, circular=True, custom_lims=(-1, 1)) 472 >>> plt.plot(grid, pdf) 473 >>> plt.show() 474 475 See Also 476 -------- 477 plot_kde : Compute and plot a kernel density estimate. 478 """ 479 x = x[np.isfinite(x)] 480 if x.size == 0 or np.all(x == x[0]): 481 warnings.warn("Your data appears to have a single value or no finite values") 482 483 return np.zeros(2), np.array([np.nan] * 2) 484 485 if circular: 486 if circular == "degrees": 487 x = np.radians(x) 488 kde_fun = _kde_circular 489 else: 490 kde_fun = _kde_linear 491 492 return kde_fun(x, **kwargs) 493 494 495def _kde_linear( 496 x, 497 bw="experimental", 498 adaptive=False, 499 extend=False, 500 bound_correction=True, 501 extend_fct=0, 502 bw_fct=1, 503 bw_return=False, 504 custom_lims=None, 505 cumulative=False, 506 grid_len=512, 507 **kwargs, # pylint: disable=unused-argument 508): 509 """One dimensional density estimation for linear data. 510 511 Given an array of data points `x` it returns an estimate of 512 the probability density function that generated the samples in `x`. 513 514 Parameters 515 ---------- 516 x : 1D numpy array 517 Data used to calculate the density estimation. 518 bw: int, float or str, optional 519 If numeric, indicates the bandwidth and must be positive. 520 If str, indicates the method to estimate the bandwidth and must be one of "scott", 521 "silverman", "isj" or "experimental". Defaults to "experimental". 522 adaptive: boolean, optional 523 Indicates if the bandwidth is adaptive or not. 524 It is the recommended approach when there are multiple modes with different spread. 525 It is not compatible with convolution. Defaults to False. 526 extend: boolean, optional 527 Whether to extend the observed range for `x` in the estimation. 528 It extends each bound by a multiple of the standard deviation of `x` given by `extend_fct`. 529 Defaults to False. 530 bound_correction: boolean, optional 531 Whether to perform boundary correction on the bounds of `x` or not. 532 Defaults to True. 533 extend_fct: float, optional 534 Number of standard deviations used to widen the lower and upper bounds of `x`. 535 Defaults to 0.5. 536 bw_fct: float, optional 537 A value that multiplies `bw` which enables tuning smoothness by hand. 538 Must be positive. Values below 1 decrease smoothness while values above 1 decrease it. 539 Defaults to 1 (no modification). 540 bw_return: bool, optional 541 Whether to return the estimated bandwidth in addition to the other objects. 542 Defaults to False. 543 custom_lims: list or tuple, optional 544 A list or tuple of length 2 indicating custom bounds for the range of `x`. 545 Defaults to None which disables custom bounds. 546 cumulative: bool, optional 547 Whether return the PDF or the cumulative PDF. Defaults to False. 548 grid_len: int, optional 549 The number of intervals used to bin the data points i.e. the length of the grid used in 550 the estimation. Defaults to 512. 551 552 Returns 553 ------- 554 grid : Gridded numpy array for the x values. 555 pdf : Numpy array for the density estimates. 556 bw: optional, the estimated bandwidth. 557 """ 558 # Check `bw_fct` is numeric and positive 559 if not isinstance(bw_fct, (int, float, np.integer, np.floating)): 560 raise TypeError(f"`bw_fct` must be a positive number, not an object of {type(bw_fct)}.") 561 562 if bw_fct <= 0: 563 raise ValueError(f"`bw_fct` must be a positive number, not {bw_fct}.") 564 565 # Preliminary calculations 566 x_min = x.min() 567 x_max = x.max() 568 x_std = np.std(x) 569 x_range = x_max - x_min 570 571 # Determine grid 572 grid_min, grid_max, grid_len = _get_grid( 573 x_min, x_max, x_std, extend_fct, grid_len, custom_lims, extend, bound_correction 574 ) 575 grid_counts, _, grid_edges = histogram(x, grid_len, (grid_min, grid_max)) 576 577 # Bandwidth estimation 578 bw = bw_fct * _get_bw(x, bw, grid_counts, x_std, x_range) 579 580 # Density estimation 581 if adaptive: 582 grid, pdf = _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction) 583 else: 584 grid, pdf = _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction) 585 586 if cumulative: 587 pdf = pdf.cumsum() / pdf.sum() 588 589 if bw_return: 590 return grid, pdf, bw 591 else: 592 return grid, pdf 593 594 595def _kde_circular( 596 x, 597 bw="taylor", 598 bw_fct=1, 599 bw_return=False, 600 custom_lims=None, 601 cumulative=False, 602 grid_len=512, 603 **kwargs, # pylint: disable=unused-argument 604): 605 """One dimensional density estimation for circular data. 606 607 Given an array of data points `x` measured in radians, it returns an estimate of the 608 probability density function that generated the samples in `x`. 609 610 Parameters 611 ---------- 612 x : 1D numpy array 613 Data used to calculate the density estimation. 614 bw: int, float or str, optional 615 If numeric, indicates the bandwidth and must be positive. 616 If str, indicates the method to estimate the bandwidth and must be "taylor" since it is the 617 only option supported so far. Defaults to "taylor". 618 bw_fct: float, optional 619 A value that multiplies `bw` which enables tuning smoothness by hand. Must be positive. 620 Values above 1 decrease smoothness while values below 1 decrease it. 621 Defaults to 1 (no modification). 622 bw_return: bool, optional 623 Whether to return the estimated bandwidth in addition to the other objects. 624 Defaults to False. 625 custom_lims: list or tuple, optional 626 A list or tuple of length 2 indicating custom bounds for the range of `x`. 627 Defaults to None which means the estimation limits are [-pi, pi]. 628 cumulative: bool, optional 629 Whether return the PDF or the cumulative PDF. Defaults to False. 630 grid_len: int, optional 631 The number of intervals used to bin the data pointa i.e. the length of the grid used in the 632 estimation. Defaults to 512. 633 """ 634 # All values between -pi and pi 635 x = _normalize_angle(x) 636 637 # Check `bw_fct` is numeric and positive 638 if not isinstance(bw_fct, (int, float, np.integer, np.floating)): 639 raise TypeError(f"`bw_fct` must be a positive number, not an object of {type(bw_fct)}.") 640 641 if bw_fct <= 0: 642 raise ValueError(f"`bw_fct` must be a positive number, not {bw_fct}.") 643 644 # Determine bandwidth 645 if isinstance(bw, bool): 646 raise ValueError( 647 "`bw` can't be of type `bool`.\n" "Expected a positive numeric or 'taylor'" 648 ) 649 if isinstance(bw, (int, float)): 650 if bw < 0: 651 raise ValueError(f"Numeric `bw` must be positive.\nInput: {bw:.4f}.") 652 if isinstance(bw, str): 653 if bw == "taylor": 654 bw = _bw_taylor(x) 655 else: 656 raise ValueError(f"`bw` must be a positive numeric or `taylor`, not {bw}") 657 bw *= bw_fct 658 659 # Determine grid 660 if custom_lims is not None: 661 custom_lims = _check_custom_lims(custom_lims, x.min(), x.max()) 662 grid_min = custom_lims[0] 663 grid_max = custom_lims[1] 664 assert grid_min >= -np.pi, "Lower limit can't be smaller than -pi" 665 assert grid_max <= np.pi, "Upper limit can't be larger than pi" 666 else: 667 grid_min = -np.pi 668 grid_max = np.pi 669 670 bins = np.linspace(grid_min, grid_max, grid_len + 1) 671 bin_counts, _, bin_edges = histogram(x, bins=bins) 672 grid = 0.5 * (bin_edges[1:] + bin_edges[:-1]) 673 674 kern = _vonmises_pdf(x=grid, mu=0, kappa=bw) 675 pdf = np.fft.fftshift(np.fft.irfft(np.fft.rfft(kern) * np.fft.rfft(bin_counts))) 676 pdf /= len(x) 677 678 if cumulative: 679 pdf = pdf.cumsum() / pdf.sum() 680 681 if bw_return: 682 return grid, pdf, bw 683 else: 684 return grid, pdf 685 686 687# pylint: disable=unused-argument 688def _kde_convolution(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): 689 """Kernel density with convolution. 690 691 One dimensional Gaussian kernel density estimation via convolution of the binned relative 692 frequencies and a Gaussian filter. This is an internal function used by `kde()`. 693 """ 694 # Calculate relative frequencies per bin 695 bin_width = grid_edges[1] - grid_edges[0] 696 f = grid_counts / bin_width / len(x) 697 698 # Bandwidth must consider the bin width 699 bw /= bin_width 700 701 # See: https://stackoverflow.com/questions/2773606/gaussian-filter-in-matlab 702 703 grid = (grid_edges[1:] + grid_edges[:-1]) / 2 704 705 kernel_n = int(bw * 2 * np.pi) 706 if kernel_n == 0: 707 kernel_n = 1 708 709 kernel = gaussian(kernel_n, bw) 710 711 if bound_correction: 712 npad = int(grid_len / 5) 713 f = np.concatenate([f[npad - 1 :: -1], f, f[grid_len : grid_len - npad - 1 : -1]]) 714 pdf = convolve(f, kernel, mode="same", method="direct")[npad : npad + grid_len] 715 pdf /= bw * (2 * np.pi) ** 0.5 716 else: 717 pdf = convolve(f, kernel, mode="same", method="direct") 718 pdf /= bw * (2 * np.pi) ** 0.5 719 720 return grid, pdf 721 722 723def _kde_adaptive(x, bw, grid_edges, grid_counts, grid_len, bound_correction, **kwargs): 724 """Compute Adaptive Kernel Density Estimation. 725 726 One dimensional adaptive Gaussian kernel density estimation. The implementation uses the binning 727 technique. Since there is not an unique `bw`, the convolution is not possible. The alternative 728 implemented in this function is known as Abramson's method. 729 This is an internal function used by `kde()`. 730 """ 731 # Pilot computations used for bandwidth adjustment 732 pilot_grid, pilot_pdf = _kde_convolution( 733 x, bw, grid_edges, grid_counts, grid_len, bound_correction 734 ) 735 736 # Adds to avoid np.log(0) and zero division 737 pilot_pdf += 1e-9 738 739 # Determine the modification factors 740 pdf_interp = np.interp(x, pilot_grid, pilot_pdf) 741 geom_mean = np.exp(np.mean(np.log(pdf_interp))) 742 743 # Power of c = 0.5 -> Abramson's method 744 adj_factor = (geom_mean / pilot_pdf) ** 0.5 745 bw_adj = bw * adj_factor 746 747 # Estimation of Gaussian KDE via binned method (convolution not possible) 748 grid = pilot_grid 749 750 if bound_correction: 751 grid_npad = int(grid_len / 5) 752 grid_width = grid_edges[1] - grid_edges[0] 753 grid_pad = grid_npad * grid_width 754 grid_padded = np.linspace( 755 grid_edges[0] - grid_pad, 756 grid_edges[grid_len - 1] + grid_pad, 757 num=grid_len + 2 * grid_npad, 758 ) 759 grid_counts = np.concatenate( 760 [ 761 grid_counts[grid_npad - 1 :: -1], 762 grid_counts, 763 grid_counts[grid_len : grid_len - grid_npad - 1 : -1], 764 ] 765 ) 766 bw_adj = np.concatenate( 767 [bw_adj[grid_npad - 1 :: -1], bw_adj, bw_adj[grid_len : grid_len - grid_npad - 1 : -1]] 768 ) 769 pdf_mat = (grid_padded - grid_padded[:, None]) / bw_adj[:, None] 770 pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None] 771 pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None] 772 pdf = np.sum(pdf_mat[:, grid_npad : grid_npad + grid_len], axis=0) / len(x) 773 774 else: 775 pdf_mat = (grid - grid[:, None]) / bw_adj[:, None] 776 pdf_mat = np.exp(-0.5 * pdf_mat ** 2) * grid_counts[:, None] 777 pdf_mat /= (2 * np.pi) ** 0.5 * bw_adj[:, None] 778 pdf = np.sum(pdf_mat, axis=0) / len(x) 779 780 return grid, pdf 781 782 783def _fast_kde_2d(x, y, gridsize=(128, 128), circular=False): 784 """ 785 2D fft-based Gaussian kernel density estimate (KDE). 786 787 The code was adapted from https://github.com/mfouesneau/faststats 788 789 Parameters 790 ---------- 791 x : Numpy array or list 792 y : Numpy array or list 793 gridsize : tuple 794 Number of points used to discretize data. Use powers of 2 for fft optimization 795 circular: bool 796 If True use circular boundaries. Defaults to False 797 798 Returns 799 ------- 800 grid: A gridded 2D KDE of the input points (x, y) 801 xmin: minimum value of x 802 xmax: maximum value of x 803 ymin: minimum value of y 804 ymax: maximum value of y 805 """ 806 x = np.asarray(x, dtype=float) 807 x = x[np.isfinite(x)] 808 y = np.asarray(y, dtype=float) 809 y = y[np.isfinite(y)] 810 811 xmin, xmax = x.min(), x.max() 812 ymin, ymax = y.min(), y.max() 813 814 len_x = len(x) 815 weights = np.ones(len_x) 816 n_x, n_y = gridsize 817 818 d_x = (xmax - xmin) / (n_x - 1) 819 d_y = (ymax - ymin) / (n_y - 1) 820 821 xyi = _stack(x, y).T 822 xyi -= [xmin, ymin] 823 xyi /= [d_x, d_y] 824 xyi = np.floor(xyi, xyi).T 825 826 scotts_factor = len_x ** (-1 / 6) 827 cov = _cov(xyi) 828 std_devs = np.diag(cov) ** 0.5 829 kern_nx, kern_ny = np.round(scotts_factor * 2 * np.pi * std_devs) 830 831 inv_cov = np.linalg.inv(cov * scotts_factor ** 2) 832 833 x_x = np.arange(kern_nx) - kern_nx / 2 834 y_y = np.arange(kern_ny) - kern_ny / 2 835 x_x, y_y = np.meshgrid(x_x, y_y) 836 837 kernel = _stack(x_x.flatten(), y_y.flatten()) 838 kernel = _dot(inv_cov, kernel) * kernel 839 kernel = np.exp(-kernel.sum(axis=0) / 2) 840 kernel = kernel.reshape((int(kern_ny), int(kern_nx))) 841 842 boundary = "wrap" if circular else "symm" 843 844 grid = coo_matrix((weights, xyi), shape=(n_x, n_y)).toarray() 845 grid = convolve2d(grid, kernel, mode="same", boundary=boundary) 846 847 norm_factor = np.linalg.det(2 * np.pi * cov * scotts_factor ** 2) 848 norm_factor = len_x * d_x * d_y * norm_factor ** 0.5 849 850 grid /= norm_factor 851 852 return grid, xmin, xmax, ymin, ymax 853 854 855def get_bins(values): 856 """ 857 Automatically compute the number of bins for discrete variables. 858 859 Parameters 860 ---------- 861 values = numpy array 862 values 863 864 Returns 865 ------- 866 array with the bins 867 868 Notes 869 ----- 870 Computes the width of the bins by taking the maximum of the Sturges and the Freedman-Diaconis 871 estimators. According to numpy `np.histogram` this provides good all around performance. 872 873 The Sturges is a very simplistic estimator based on the assumption of normality of the data. 874 This estimator has poor performance for non-normal data, which becomes especially obvious for 875 large data sets. The estimate depends only on size of the data. 876 877 The Freedman-Diaconis rule uses interquartile range (IQR) to estimate the binwidth. 878 It is considered a robust version of the Scott rule as the IQR is less affected by outliers 879 than the standard deviation. However, the IQR depends on fewer points than the standard 880 deviation, so it is less accurate, especially for long tailed distributions. 881 """ 882 dtype = values.dtype.kind 883 884 if dtype == "i": 885 x_min = values.min().astype(int) 886 x_max = values.max().astype(int) 887 else: 888 x_min = values.min().astype(float) 889 x_max = values.max().astype(float) 890 891 # Sturges histogram bin estimator 892 bins_sturges = (x_max - x_min) / (np.log2(values.size) + 1) 893 894 # The Freedman-Diaconis histogram bin estimator. 895 iqr = np.subtract(*np.percentile(values, [75, 25])) # pylint: disable=assignment-from-no-return 896 bins_fd = 2 * iqr * values.size ** (-1 / 3) 897 898 if dtype == "i": 899 width = np.round(np.max([1, bins_sturges, bins_fd])).astype(int) 900 bins = np.arange(x_min, x_max + width + 1, width) 901 else: 902 width = np.max([bins_sturges, bins_fd]) 903 if np.isclose(x_min, x_max): 904 width = 1e-3 905 bins = np.arange(x_min, x_max + width, width) 906 907 return bins 908 909 910def _sturges_formula(dataset, mult=1): 911 """Use Sturges' formula to determine number of bins. 912 913 See https://en.wikipedia.org/wiki/Histogram#Sturges'_formula 914 or https://doi.org/10.1080%2F01621459.1926.10502161 915 916 Parameters 917 ---------- 918 dataset: xarray.DataSet 919 Must have the `draw` dimension 920 921 mult: float 922 Used to scale the number of bins up or down. Default is 1 for Sturges' formula. 923 924 Returns 925 ------- 926 int 927 Number of bins to use 928 """ 929 return int(np.ceil(mult * np.log2(dataset.draw.size)) + 1) 930 931 932def _circular_mean(x): 933 """Compute mean of circular variable measured in radians. 934 935 The result is between -pi and pi. 936 """ 937 sinr = np.sum(np.sin(x)) 938 cosr = np.sum(np.cos(x)) 939 mean = np.arctan2(sinr, cosr) 940 941 return mean 942 943 944def _normalize_angle(x, zero_centered=True): 945 """Normalize angles. 946 947 Normalize angles in radians to [-pi, pi) or [0, 2 * pi) according to `zero_centered`. 948 """ 949 if zero_centered: 950 return (x + np.pi) % (2 * np.pi) - np.pi 951 else: 952 return x % (2 * np.pi) 953 954 955@conditional_jit(cache=True) 956def histogram(data, bins, range_hist=None): 957 """Conditionally jitted histogram. 958 959 Parameters 960 ---------- 961 data : array-like 962 Input data. Passed as first positional argument to ``np.histogram``. 963 bins : int or array-like 964 Passed as keyword argument ``bins`` to ``np.histogram``. 965 range_hist : (float, float), optional 966 Passed as keyword argument ``range`` to ``np.histogram``. 967 968 Returns 969 ------- 970 hist : array 971 The number of counts per bin. 972 density : array 973 The density corresponding to each bin. 974 bin_edges : array 975 The edges of the bins used. 976 """ 977 hist, bin_edges = np.histogram(data, bins=bins, range=range_hist) 978 hist_dens = hist / (hist.sum() * np.diff(bin_edges)) 979 return hist, hist_dens, bin_edges 980 981 982def _find_hdi_contours(density, hdi_probs): 983 """ 984 Find contours enclosing regions of highest posterior density. 985 986 Parameters 987 ---------- 988 density : array-like 989 A 2D KDE on a grid with cells of equal area. 990 hdi_probs : array-like 991 An array of highest density interval confidence probabilities. 992 993 Returns 994 ------- 995 contour_levels : array 996 The contour levels corresponding to the given HDI probabilities. 997 """ 998 # Using the algorithm from corner.py 999 sorted_density = np.sort(density, axis=None)[::-1] 1000 sm = sorted_density.cumsum() 1001 sm /= sm[-1] 1002 1003 contours = np.empty_like(hdi_probs) 1004 for idx, hdi_prob in enumerate(hdi_probs): 1005 try: 1006 contours[idx] = sorted_density[sm <= hdi_prob][-1] 1007 except IndexError: 1008 contours[idx] = sorted_density[0] 1009 1010 return contours 1011