1# Licensed under a 3-clause BSD style license - see LICENSE.rst 2""" 3This module contains simple statistical algorithms that are 4straightforwardly implemented as a single python function (or family of 5functions). 6 7This module should generally not be used directly. Everything in 8`__all__` is imported into `astropy.stats`, and hence that package 9should be used for access. 10""" 11 12import math 13 14import numpy as np 15 16import astropy.units as u 17from . import _stats 18 19__all__ = ['gaussian_fwhm_to_sigma', 'gaussian_sigma_to_fwhm', 20 'binom_conf_interval', 'binned_binom_proportion', 21 'poisson_conf_interval', 'median_absolute_deviation', 'mad_std', 22 'signal_to_noise_oir_ccd', 'bootstrap', 'kuiper', 'kuiper_two', 23 'kuiper_false_positive_probability', 'cdf_from_intervals', 24 'interval_overlap_length', 'histogram_intervals', 'fold_intervals'] 25 26__doctest_skip__ = ['binned_binom_proportion'] 27__doctest_requires__ = {'binom_conf_interval': ['scipy'], 28 'poisson_conf_interval': ['scipy']} 29 30 31gaussian_sigma_to_fwhm = 2.0 * math.sqrt(2.0 * math.log(2.0)) 32""" 33Factor with which to multiply Gaussian 1-sigma standard deviation to 34convert it to full width at half maximum (FWHM). 35""" 36 37gaussian_fwhm_to_sigma = 1. / gaussian_sigma_to_fwhm 38""" 39Factor with which to multiply Gaussian full width at half maximum (FWHM) 40to convert it to 1-sigma standard deviation. 41""" 42 43 44# NUMPY_LT_1_18 45def _expand_dims(data, axis): 46 """ 47 Expand the shape of an array. 48 49 Insert a new axis that will appear at the `axis` position in the 50 expanded array shape. 51 52 This function allows for tuple axis arguments. 53 ``numpy.expand_dims`` currently does not allow that, but it will in 54 numpy v1.18 (https://github.com/numpy/numpy/pull/14051). 55 ``_expand_dims`` can be replaced with ``numpy.expand_dims`` when the 56 minimum support numpy version is v1.18. 57 58 Parameters 59 ---------- 60 data : array-like 61 Input array. 62 axis : int or tuple of int 63 Position in the expanded axes where the new axis (or axes) is 64 placed. A tuple of axes is now supported. Out of range axes as 65 described above are now forbidden and raise an `AxisError`. 66 67 Returns 68 ------- 69 result : ndarray 70 View of ``data`` with the number of dimensions increased. 71 """ 72 73 if isinstance(data, np.matrix): 74 data = np.asarray(data) 75 else: 76 data = np.asanyarray(data) 77 78 if not isinstance(axis, (tuple, list)): 79 axis = (axis,) 80 81 out_ndim = len(axis) + data.ndim 82 axis = np.core.numeric.normalize_axis_tuple(axis, out_ndim) 83 84 shape_it = iter(data.shape) 85 shape = [1 if ax in axis else next(shape_it) for ax in range(out_ndim)] 86 87 return data.reshape(shape) 88 89 90def binom_conf_interval(k, n, confidence_level=0.68269, interval='wilson'): 91 r"""Binomial proportion confidence interval given k successes, 92 n trials. 93 94 Parameters 95 ---------- 96 k : int or numpy.ndarray 97 Number of successes (0 <= ``k`` <= ``n``). 98 n : int or numpy.ndarray 99 Number of trials (``n`` > 0). If both ``k`` and ``n`` are arrays, 100 they must have the same shape. 101 confidence_level : float, optional 102 Desired probability content of interval. Default is 0.68269, 103 corresponding to 1 sigma in a 1-dimensional Gaussian distribution. 104 Confidence level must be in range [0, 1]. 105 interval : {'wilson', 'jeffreys', 'flat', 'wald'}, optional 106 Formula used for confidence interval. See notes for details. The 107 ``'wilson'`` and ``'jeffreys'`` intervals generally give similar 108 results, while 'flat' is somewhat different, especially for small 109 values of ``n``. ``'wilson'`` should be somewhat faster than 110 ``'flat'`` or ``'jeffreys'``. The 'wald' interval is generally not 111 recommended. It is provided for comparison purposes. Default is 112 ``'wilson'``. 113 114 Returns 115 ------- 116 conf_interval : ndarray 117 ``conf_interval[0]`` and ``conf_interval[1]`` correspond to the lower 118 and upper limits, respectively, for each element in ``k``, ``n``. 119 120 Notes 121 ----- 122 In situations where a probability of success is not known, it can 123 be estimated from a number of trials (n) and number of 124 observed successes (k). For example, this is done in Monte 125 Carlo experiments designed to estimate a detection efficiency. It 126 is simple to take the sample proportion of successes (k/n) 127 as a reasonable best estimate of the true probability 128 :math:`\epsilon`. However, deriving an accurate confidence 129 interval on :math:`\epsilon` is non-trivial. There are several 130 formulas for this interval (see [1]_). Four intervals are implemented 131 here: 132 133 **1. The Wilson Interval.** This interval, attributed to Wilson [2]_, 134 is given by 135 136 .. math:: 137 138 CI_{\rm Wilson} = \frac{k + \kappa^2/2}{n + \kappa^2} 139 \pm \frac{\kappa n^{1/2}}{n + \kappa^2} 140 ((\hat{\epsilon}(1 - \hat{\epsilon}) + \kappa^2/(4n))^{1/2} 141 142 where :math:`\hat{\epsilon} = k / n` and :math:`\kappa` is the 143 number of standard deviations corresponding to the desired 144 confidence interval for a *normal* distribution (for example, 145 1.0 for a confidence interval of 68.269%). For a 146 confidence interval of 100(1 - :math:`\alpha`)%, 147 148 .. math:: 149 150 \kappa = \Phi^{-1}(1-\alpha/2) = \sqrt{2}{\rm erf}^{-1}(1-\alpha). 151 152 **2. The Jeffreys Interval.** This interval is derived by applying 153 Bayes' theorem to the binomial distribution with the 154 noninformative Jeffreys prior [3]_, [4]_. The noninformative Jeffreys 155 prior is the Beta distribution, Beta(1/2, 1/2), which has the density 156 function 157 158 .. math:: 159 160 f(\epsilon) = \pi^{-1} \epsilon^{-1/2}(1-\epsilon)^{-1/2}. 161 162 The justification for this prior is that it is invariant under 163 reparameterizations of the binomial proportion. 164 The posterior density function is also a Beta distribution: Beta(k 165 + 1/2, n - k + 1/2). The interval is then chosen so that it is 166 *equal-tailed*: Each tail (outside the interval) contains 167 :math:`\alpha`/2 of the posterior probability, and the interval 168 itself contains 1 - :math:`\alpha`. This interval must be 169 calculated numerically. Additionally, when k = 0 the lower limit 170 is set to 0 and when k = n the upper limit is set to 1, so that in 171 these cases, there is only one tail containing :math:`\alpha`/2 172 and the interval itself contains 1 - :math:`\alpha`/2 rather than 173 the nominal 1 - :math:`\alpha`. 174 175 **3. A Flat prior.** This is similar to the Jeffreys interval, 176 but uses a flat (uniform) prior on the binomial proportion 177 over the range 0 to 1 rather than the reparametrization-invariant 178 Jeffreys prior. The posterior density function is a Beta distribution: 179 Beta(k + 1, n - k + 1). The same comments about the nature of the 180 interval (equal-tailed, etc.) also apply to this option. 181 182 **4. The Wald Interval.** This interval is given by 183 184 .. math:: 185 186 CI_{\rm Wald} = \hat{\epsilon} \pm 187 \kappa \sqrt{\frac{\hat{\epsilon}(1-\hat{\epsilon})}{n}} 188 189 The Wald interval gives acceptable results in some limiting 190 cases. Particularly, when n is very large, and the true proportion 191 :math:`\epsilon` is not "too close" to 0 or 1. However, as the 192 later is not verifiable when trying to estimate :math:`\epsilon`, 193 this is not very helpful. Its use is not recommended, but it is 194 provided here for comparison purposes due to its prevalence in 195 everyday practical statistics. 196 197 This function requires ``scipy`` for all interval types. 198 199 References 200 ---------- 201 .. [1] Brown, Lawrence D.; Cai, T. Tony; DasGupta, Anirban (2001). 202 "Interval Estimation for a Binomial Proportion". Statistical 203 Science 16 (2): 101-133. doi:10.1214/ss/1009213286 204 205 .. [2] Wilson, E. B. (1927). "Probable inference, the law of 206 succession, and statistical inference". Journal of the American 207 Statistical Association 22: 209-212. 208 209 .. [3] Jeffreys, Harold (1946). "An Invariant Form for the Prior 210 Probability in Estimation Problems". Proc. R. Soc. Lond.. A 24 186 211 (1007): 453-461. doi:10.1098/rspa.1946.0056 212 213 .. [4] Jeffreys, Harold (1998). Theory of Probability. Oxford 214 University Press, 3rd edition. ISBN 978-0198503682 215 216 Examples 217 -------- 218 Integer inputs return an array with shape (2,): 219 220 >>> binom_conf_interval(4, 5, interval='wilson') # doctest: +FLOAT_CMP 221 array([0.57921724, 0.92078259]) 222 223 Arrays of arbitrary dimension are supported. The Wilson and Jeffreys 224 intervals give similar results, even for small k, n: 225 226 >>> binom_conf_interval([1, 2], 5, interval='wilson') # doctest: +FLOAT_CMP 227 array([[0.07921741, 0.21597328], 228 [0.42078276, 0.61736012]]) 229 230 >>> binom_conf_interval([1, 2,], 5, interval='jeffreys') # doctest: +FLOAT_CMP 231 array([[0.0842525 , 0.21789949], 232 [0.42218001, 0.61753691]]) 233 234 >>> binom_conf_interval([1, 2], 5, interval='flat') # doctest: +FLOAT_CMP 235 array([[0.12139799, 0.24309021], 236 [0.45401727, 0.61535699]]) 237 238 In contrast, the Wald interval gives poor results for small k, n. 239 For k = 0 or k = n, the interval always has zero length. 240 241 >>> binom_conf_interval([1, 2], 5, interval='wald') # doctest: +FLOAT_CMP 242 array([[0.02111437, 0.18091075], 243 [0.37888563, 0.61908925]]) 244 245 For confidence intervals approaching 1, the Wald interval for 246 0 < k < n can give intervals that extend outside [0, 1]: 247 248 >>> binom_conf_interval([1, 2], 5, interval='wald', confidence_level=0.99) # doctest: +FLOAT_CMP 249 array([[-0.26077835, -0.16433593], 250 [ 0.66077835, 0.96433593]]) 251 252 """ # noqa 253 if confidence_level < 0. or confidence_level > 1.: 254 raise ValueError('confidence_level must be between 0. and 1.') 255 alpha = 1. - confidence_level 256 257 k = np.asarray(k).astype(int) 258 n = np.asarray(n).astype(int) 259 260 if (n <= 0).any(): 261 raise ValueError('n must be positive') 262 if (k < 0).any() or (k > n).any(): 263 raise ValueError('k must be in {0, 1, .., n}') 264 265 if interval == 'wilson' or interval == 'wald': 266 from scipy.special import erfinv 267 kappa = np.sqrt(2.) * min(erfinv(confidence_level), 1.e10) # Avoid overflows. 268 k = k.astype(float) 269 n = n.astype(float) 270 p = k / n 271 272 if interval == 'wilson': 273 midpoint = (k + kappa ** 2 / 2.) / (n + kappa ** 2) 274 halflength = (kappa * np.sqrt(n)) / (n + kappa ** 2) * \ 275 np.sqrt(p * (1 - p) + kappa ** 2 / (4 * n)) 276 conf_interval = np.array([midpoint - halflength, 277 midpoint + halflength]) 278 279 # Correct intervals out of range due to floating point errors. 280 conf_interval[conf_interval < 0.] = 0. 281 conf_interval[conf_interval > 1.] = 1. 282 else: 283 midpoint = p 284 halflength = kappa * np.sqrt(p * (1. - p) / n) 285 conf_interval = np.array([midpoint - halflength, 286 midpoint + halflength]) 287 288 elif interval == 'jeffreys' or interval == 'flat': 289 from scipy.special import betaincinv 290 291 if interval == 'jeffreys': 292 lowerbound = betaincinv(k + 0.5, n - k + 0.5, 0.5 * alpha) 293 upperbound = betaincinv(k + 0.5, n - k + 0.5, 1. - 0.5 * alpha) 294 else: 295 lowerbound = betaincinv(k + 1, n - k + 1, 0.5 * alpha) 296 upperbound = betaincinv(k + 1, n - k + 1, 1. - 0.5 * alpha) 297 298 # Set lower or upper bound to k/n when k/n = 0 or 1 299 # We have to treat the special case of k/n being scalars, 300 # which is an ugly kludge 301 if lowerbound.ndim == 0: 302 if k == 0: 303 lowerbound = 0. 304 elif k == n: 305 upperbound = 1. 306 else: 307 lowerbound[k == 0] = 0 308 upperbound[k == n] = 1 309 310 conf_interval = np.array([lowerbound, upperbound]) 311 else: 312 raise ValueError(f'Unrecognized interval: {interval:s}') 313 314 return conf_interval 315 316 317def binned_binom_proportion(x, success, bins=10, range=None, 318 confidence_level=0.68269, interval='wilson'): 319 """Binomial proportion and confidence interval in bins of a continuous 320 variable ``x``. 321 322 Given a set of datapoint pairs where the ``x`` values are 323 continuously distributed and the ``success`` values are binomial 324 ("success / failure" or "true / false"), place the pairs into 325 bins according to ``x`` value and calculate the binomial proportion 326 (fraction of successes) and confidence interval in each bin. 327 328 Parameters 329 ---------- 330 x : sequence 331 Values. 332 success : sequence of bool 333 Success (`True`) or failure (`False`) corresponding to each value 334 in ``x``. Must be same length as ``x``. 335 bins : int or sequence of scalar, optional 336 If bins is an int, it defines the number of equal-width bins 337 in the given range (10, by default). If bins is a sequence, it 338 defines the bin edges, including the rightmost edge, allowing 339 for non-uniform bin widths (in this case, 'range' is ignored). 340 range : (float, float), optional 341 The lower and upper range of the bins. If `None` (default), 342 the range is set to ``(x.min(), x.max())``. Values outside the 343 range are ignored. 344 confidence_level : float, optional 345 Must be in range [0, 1]. 346 Desired probability content in the confidence 347 interval ``(p - perr[0], p + perr[1])`` in each bin. Default is 348 0.68269. 349 interval : {'wilson', 'jeffreys', 'flat', 'wald'}, optional 350 Formula used to calculate confidence interval on the 351 binomial proportion in each bin. See `binom_conf_interval` for 352 definition of the intervals. The 'wilson', 'jeffreys', 353 and 'flat' intervals generally give similar results. 'wilson' 354 should be somewhat faster, while 'jeffreys' and 'flat' are 355 marginally superior, but differ in the assumed prior. 356 The 'wald' interval is generally not recommended. 357 It is provided for comparison purposes. Default is 'wilson'. 358 359 Returns 360 ------- 361 bin_ctr : ndarray 362 Central value of bins. Bins without any entries are not returned. 363 bin_halfwidth : ndarray 364 Half-width of each bin such that ``bin_ctr - bin_halfwidth`` and 365 ``bin_ctr + bins_halfwidth`` give the left and right side of each bin, 366 respectively. 367 p : ndarray 368 Efficiency in each bin. 369 perr : ndarray 370 2-d array of shape (2, len(p)) representing the upper and lower 371 uncertainty on p in each bin. 372 373 Notes 374 ----- 375 This function requires ``scipy`` for all interval types. 376 377 See Also 378 -------- 379 binom_conf_interval : Function used to estimate confidence interval in 380 each bin. 381 382 Examples 383 -------- 384 Suppose we wish to estimate the efficiency of a survey in 385 detecting astronomical sources as a function of magnitude (i.e., 386 the probability of detecting a source given its magnitude). In a 387 realistic case, we might prepare a large number of sources with 388 randomly selected magnitudes, inject them into simulated images, 389 and then record which were detected at the end of the reduction 390 pipeline. As a toy example, we generate 100 data points with 391 randomly selected magnitudes between 20 and 30 and "observe" them 392 with a known detection function (here, the error function, with 393 50% detection probability at magnitude 25): 394 395 >>> from scipy.special import erf 396 >>> from scipy.stats.distributions import binom 397 >>> def true_efficiency(x): 398 ... return 0.5 - 0.5 * erf((x - 25.) / 2.) 399 >>> mag = 20. + 10. * np.random.rand(100) 400 >>> detected = binom.rvs(1, true_efficiency(mag)) 401 >>> bins, binshw, p, perr = binned_binom_proportion(mag, detected, bins=20) 402 >>> plt.errorbar(bins, p, xerr=binshw, yerr=perr, ls='none', marker='o', 403 ... label='estimate') 404 405 .. plot:: 406 407 import numpy as np 408 from scipy.special import erf 409 from scipy.stats.distributions import binom 410 import matplotlib.pyplot as plt 411 from astropy.stats import binned_binom_proportion 412 def true_efficiency(x): 413 return 0.5 - 0.5 * erf((x - 25.) / 2.) 414 np.random.seed(400) 415 mag = 20. + 10. * np.random.rand(100) 416 np.random.seed(600) 417 detected = binom.rvs(1, true_efficiency(mag)) 418 bins, binshw, p, perr = binned_binom_proportion(mag, detected, bins=20) 419 plt.errorbar(bins, p, xerr=binshw, yerr=perr, ls='none', marker='o', 420 label='estimate') 421 X = np.linspace(20., 30., 1000) 422 plt.plot(X, true_efficiency(X), label='true efficiency') 423 plt.ylim(0., 1.) 424 plt.title('Detection efficiency vs magnitude') 425 plt.xlabel('Magnitude') 426 plt.ylabel('Detection efficiency') 427 plt.legend() 428 plt.show() 429 430 The above example uses the Wilson confidence interval to calculate 431 the uncertainty ``perr`` in each bin (see the definition of various 432 confidence intervals in `binom_conf_interval`). A commonly used 433 alternative is the Wald interval. However, the Wald interval can 434 give nonsensical uncertainties when the efficiency is near 0 or 1, 435 and is therefore **not** recommended. As an illustration, the 436 following example shows the same data as above but uses the Wald 437 interval rather than the Wilson interval to calculate ``perr``: 438 439 >>> bins, binshw, p, perr = binned_binom_proportion(mag, detected, bins=20, 440 ... interval='wald') 441 >>> plt.errorbar(bins, p, xerr=binshw, yerr=perr, ls='none', marker='o', 442 ... label='estimate') 443 444 .. plot:: 445 446 import numpy as np 447 from scipy.special import erf 448 from scipy.stats.distributions import binom 449 import matplotlib.pyplot as plt 450 from astropy.stats import binned_binom_proportion 451 def true_efficiency(x): 452 return 0.5 - 0.5 * erf((x - 25.) / 2.) 453 np.random.seed(400) 454 mag = 20. + 10. * np.random.rand(100) 455 np.random.seed(600) 456 detected = binom.rvs(1, true_efficiency(mag)) 457 bins, binshw, p, perr = binned_binom_proportion(mag, detected, bins=20, 458 interval='wald') 459 plt.errorbar(bins, p, xerr=binshw, yerr=perr, ls='none', marker='o', 460 label='estimate') 461 X = np.linspace(20., 30., 1000) 462 plt.plot(X, true_efficiency(X), label='true efficiency') 463 plt.ylim(0., 1.) 464 plt.title('The Wald interval can give nonsensical uncertainties') 465 plt.xlabel('Magnitude') 466 plt.ylabel('Detection efficiency') 467 plt.legend() 468 plt.show() 469 470 """ 471 x = np.ravel(x) 472 success = np.ravel(success).astype(bool) 473 if x.shape != success.shape: 474 raise ValueError('sizes of x and success must match') 475 476 # Put values into a histogram (`n`). Put "successful" values 477 # into a second histogram (`k`) with identical binning. 478 n, bin_edges = np.histogram(x, bins=bins, range=range) 479 k, bin_edges = np.histogram(x[success], bins=bin_edges) 480 bin_ctr = (bin_edges[:-1] + bin_edges[1:]) / 2. 481 bin_halfwidth = bin_ctr - bin_edges[:-1] 482 483 # Remove bins with zero entries. 484 valid = n > 0 485 bin_ctr = bin_ctr[valid] 486 bin_halfwidth = bin_halfwidth[valid] 487 n = n[valid] 488 k = k[valid] 489 490 p = k / n 491 bounds = binom_conf_interval(k, n, confidence_level=confidence_level, interval=interval) 492 perr = np.abs(bounds - p) 493 494 return bin_ctr, bin_halfwidth, p, perr 495 496 497def _check_poisson_conf_inputs(sigma, background, confidence_level, name): 498 if sigma != 1: 499 raise ValueError(f"Only sigma=1 supported for interval {name}") 500 if background != 0: 501 raise ValueError(f"background not supported for interval {name}") 502 if confidence_level is not None: 503 raise ValueError(f"confidence_level not supported for interval {name}") 504 505 506def poisson_conf_interval(n, interval='root-n', sigma=1, background=0, 507 confidence_level=None): 508 r"""Poisson parameter confidence interval given observed counts 509 510 Parameters 511 ---------- 512 n : int or numpy.ndarray 513 Number of counts (0 <= ``n``). 514 interval : {'root-n','root-n-0','pearson','sherpagehrels','frequentist-confidence', 'kraft-burrows-nousek'}, optional 515 Formula used for confidence interval. See notes for details. 516 Default is ``'root-n'``. 517 sigma : float, optional 518 Number of sigma for confidence interval; only supported for 519 the 'frequentist-confidence' mode. 520 background : float, optional 521 Number of counts expected from the background; only supported for 522 the 'kraft-burrows-nousek' mode. This number is assumed to be determined 523 from a large region so that the uncertainty on its value is negligible. 524 confidence_level : float, optional 525 Confidence level between 0 and 1; only supported for the 526 'kraft-burrows-nousek' mode. 527 528 Returns 529 ------- 530 conf_interval : ndarray 531 ``conf_interval[0]`` and ``conf_interval[1]`` correspond to the lower 532 and upper limits, respectively, for each element in ``n``. 533 534 Notes 535 ----- 536 537 The "right" confidence interval to use for Poisson data is a 538 matter of debate. The CDF working group `recommends 539 <https://web.archive.org/web/20210222093249/https://www-cdf.fnal.gov/physics/statistics/notes/pois_eb.txt>`_ 540 using root-n throughout, largely in the interest of 541 comprehensibility, but discusses other possibilities. The ATLAS 542 group also `discusses 543 <http://www.pp.rhul.ac.uk/~cowan/atlas/ErrorBars.pdf>`_ several 544 possibilities but concludes that no single representation is 545 suitable for all cases. The suggestion has also been `floated 546 <https://ui.adsabs.harvard.edu/abs/2012EPJP..127...24A>`_ that error 547 bars should be attached to theoretical predictions instead of 548 observed data, which this function will not help with (but it's 549 easy; then you really should use the square root of the theoretical 550 prediction). 551 552 The intervals implemented here are: 553 554 **1. 'root-n'** This is a very widely used standard rule derived 555 from the maximum-likelihood estimator for the mean of the Poisson 556 process. While it produces questionable results for small n and 557 outright wrong results for n=0, it is standard enough that people are 558 (supposedly) used to interpreting these wonky values. The interval is 559 560 .. math:: 561 562 CI = (n-\sqrt{n}, n+\sqrt{n}) 563 564 **2. 'root-n-0'** This is identical to the above except that where 565 n is zero the interval returned is (0,1). 566 567 **3. 'pearson'** This is an only-slightly-more-complicated rule 568 based on Pearson's chi-squared rule (as `explained 569 <https://web.archive.org/web/20210222093249/https://www-cdf.fnal.gov/physics/statistics/notes/pois_eb.txt>`_ by 570 the CDF working group). It also has the nice feature that if your 571 theory curve touches an endpoint of the interval, then your data 572 point is indeed one sigma away. The interval is 573 574 .. math:: 575 576 CI = (n+0.5-\sqrt{n+0.25}, n+0.5+\sqrt{n+0.25}) 577 578 **4. 'sherpagehrels'** This rule is used by default in the fitting 579 package 'sherpa'. The `documentation 580 <https://cxc.harvard.edu/sherpa4.4/statistics/#chigehrels>`_ claims 581 it is based on a numerical approximation published in `Gehrels 582 (1986) <https://ui.adsabs.harvard.edu/abs/1986ApJ...303..336G>`_ but it 583 does not actually appear there. It is symmetrical, and while the 584 upper limits are within about 1% of those given by 585 'frequentist-confidence', the lower limits can be badly wrong. The 586 interval is 587 588 .. math:: 589 590 CI = (n-1-\sqrt{n+0.75}, n+1+\sqrt{n+0.75}) 591 592 **5. 'frequentist-confidence'** These are frequentist central 593 confidence intervals: 594 595 .. math:: 596 597 CI = (0.5 F_{\chi^2}^{-1}(\alpha;2n), 598 0.5 F_{\chi^2}^{-1}(1-\alpha;2(n+1))) 599 600 where :math:`F_{\chi^2}^{-1}` is the quantile of the chi-square 601 distribution with the indicated number of degrees of freedom and 602 :math:`\alpha` is the one-tailed probability of the normal 603 distribution (at the point given by the parameter 'sigma'). See 604 `Maxwell (2011) 605 <https://ui.adsabs.harvard.edu/abs/2011arXiv1102.0822M>`_ for further 606 details. 607 608 **6. 'kraft-burrows-nousek'** This is a Bayesian approach which allows 609 for the presence of a known background :math:`B` in the source signal 610 :math:`N`. 611 For a given confidence level :math:`CL` the confidence interval 612 :math:`[S_\mathrm{min}, S_\mathrm{max}]` is given by: 613 614 .. math:: 615 616 CL = \int^{S_\mathrm{max}}_{S_\mathrm{min}} f_{N,B}(S)dS 617 618 where the function :math:`f_{N,B}` is: 619 620 .. math:: 621 622 f_{N,B}(S) = C \frac{e^{-(S+B)}(S+B)^N}{N!} 623 624 and the normalization constant :math:`C`: 625 626 .. math:: 627 628 C = \left[ \int_0^\infty \frac{e^{-(S+B)}(S+B)^N}{N!} dS \right] ^{-1} 629 = \left( \sum^N_{n=0} \frac{e^{-B}B^n}{n!} \right)^{-1} 630 631 See `Kraft, Burrows, and Nousek (1991) 632 <https://ui.adsabs.harvard.edu/abs/1991ApJ...374..344K>`_ for further 633 details. 634 635 These formulas implement a positive, uniform prior. 636 `Kraft, Burrows, and Nousek (1991) 637 <https://ui.adsabs.harvard.edu/abs/1991ApJ...374..344K>`_ discuss this 638 choice in more detail and show that the problem is relatively 639 insensitive to the choice of prior. 640 641 This function has an optional dependency: Either `Scipy 642 <https://www.scipy.org/>`_ or `mpmath <http://mpmath.org/>`_ need 643 to be available (Scipy works only for N < 100). 644 This code is very intense numerically, which makes it much slower than 645 the other methods, in particular for large count numbers (above 1000 646 even with ``mpmath``). Fortunately, some of the other methods or a 647 Gaussian approximation usually work well in this regime. 648 649 Examples 650 -------- 651 >>> poisson_conf_interval(np.arange(10), interval='root-n').T 652 array([[ 0. , 0. ], 653 [ 0. , 2. ], 654 [ 0.58578644, 3.41421356], 655 [ 1.26794919, 4.73205081], 656 [ 2. , 6. ], 657 [ 2.76393202, 7.23606798], 658 [ 3.55051026, 8.44948974], 659 [ 4.35424869, 9.64575131], 660 [ 5.17157288, 10.82842712], 661 [ 6. , 12. ]]) 662 663 >>> poisson_conf_interval(np.arange(10), interval='root-n-0').T 664 array([[ 0. , 1. ], 665 [ 0. , 2. ], 666 [ 0.58578644, 3.41421356], 667 [ 1.26794919, 4.73205081], 668 [ 2. , 6. ], 669 [ 2.76393202, 7.23606798], 670 [ 3.55051026, 8.44948974], 671 [ 4.35424869, 9.64575131], 672 [ 5.17157288, 10.82842712], 673 [ 6. , 12. ]]) 674 675 >>> poisson_conf_interval(np.arange(10), interval='pearson').T 676 array([[ 0. , 1. ], 677 [ 0.38196601, 2.61803399], 678 [ 1. , 4. ], 679 [ 1.69722436, 5.30277564], 680 [ 2.43844719, 6.56155281], 681 [ 3.20871215, 7.79128785], 682 [ 4. , 9. ], 683 [ 4.8074176 , 10.1925824 ], 684 [ 5.62771868, 11.37228132], 685 [ 6.45861873, 12.54138127]]) 686 687 >>> poisson_conf_interval( 688 ... np.arange(10), interval='frequentist-confidence').T 689 array([[ 0. , 1.84102165], 690 [ 0.17275378, 3.29952656], 691 [ 0.70818544, 4.63785962], 692 [ 1.36729531, 5.91818583], 693 [ 2.08566081, 7.16275317], 694 [ 2.84030886, 8.38247265], 695 [ 3.62006862, 9.58364155], 696 [ 4.41852954, 10.77028072], 697 [ 5.23161394, 11.94514152], 698 [ 6.05653896, 13.11020414]]) 699 700 >>> poisson_conf_interval( 701 ... 7, interval='frequentist-confidence').T 702 array([ 4.41852954, 10.77028072]) 703 704 >>> poisson_conf_interval( 705 ... 10, background=1.5, confidence_level=0.95, 706 ... interval='kraft-burrows-nousek').T # doctest: +FLOAT_CMP 707 array([[ 3.47894005, 16.113329533]]) 708 709 """ # noqa 710 711 if not np.isscalar(n): 712 n = np.asanyarray(n) 713 714 if interval == 'root-n': 715 _check_poisson_conf_inputs(sigma, background, confidence_level, interval) 716 conf_interval = np.array([n - np.sqrt(n), 717 n + np.sqrt(n)]) 718 elif interval == 'root-n-0': 719 _check_poisson_conf_inputs(sigma, background, confidence_level, interval) 720 conf_interval = np.array([n - np.sqrt(n), 721 n + np.sqrt(n)]) 722 if np.isscalar(n): 723 if n == 0: 724 conf_interval[1] = 1 725 else: 726 conf_interval[1, n == 0] = 1 727 elif interval == 'pearson': 728 _check_poisson_conf_inputs(sigma, background, confidence_level, interval) 729 conf_interval = np.array([n + 0.5 - np.sqrt(n + 0.25), 730 n + 0.5 + np.sqrt(n + 0.25)]) 731 elif interval == 'sherpagehrels': 732 _check_poisson_conf_inputs(sigma, background, confidence_level, interval) 733 conf_interval = np.array([n - 1 - np.sqrt(n + 0.75), 734 n + 1 + np.sqrt(n + 0.75)]) 735 elif interval == 'frequentist-confidence': 736 _check_poisson_conf_inputs(1., background, confidence_level, interval) 737 import scipy.stats 738 alpha = scipy.stats.norm.sf(sigma) 739 conf_interval = np.array([0.5 * scipy.stats.chi2(2 * n).ppf(alpha), 740 0.5 * scipy.stats.chi2(2 * n + 2).isf(alpha)]) 741 if np.isscalar(n): 742 if n == 0: 743 conf_interval[0] = 0 744 else: 745 conf_interval[0, n == 0] = 0 746 elif interval == 'kraft-burrows-nousek': 747 # Deprecation warning in Python 3.9 when N is float, so we force int, 748 # see https://github.com/astropy/astropy/issues/10832 749 if np.isscalar(n): 750 if not isinstance(n, int): 751 raise TypeError('Number of counts must be integer.') 752 elif not issubclass(n.dtype.type, np.integer): 753 raise TypeError('Number of counts must be integer.') 754 755 if confidence_level is None: 756 raise ValueError('Set confidence_level for method {}. (sigma is ' 757 'ignored.)'.format(interval)) 758 confidence_level = np.asanyarray(confidence_level) 759 if np.any(confidence_level <= 0) or np.any(confidence_level >= 1): 760 raise ValueError('confidence_level must be a number between 0 and 1.') 761 background = np.asanyarray(background) 762 if np.any(background < 0): 763 raise ValueError('Background must be >= 0.') 764 conf_interval = np.vectorize(_kraft_burrows_nousek, 765 cache=True)(n, background, confidence_level) 766 conf_interval = np.vstack(conf_interval) 767 else: 768 raise ValueError(f"Invalid method for Poisson confidence intervals: {interval}") 769 return conf_interval 770 771 772def median_absolute_deviation(data, axis=None, func=None, ignore_nan=False): 773 """ 774 Calculate the median absolute deviation (MAD). 775 776 The MAD is defined as ``median(abs(a - median(a)))``. 777 778 Parameters 779 ---------- 780 data : array-like 781 Input array or object that can be converted to an array. 782 axis : None, int, or tuple of int, optional 783 The axis or axes along which the MADs are computed. The default 784 (`None`) is to compute the MAD of the flattened array. 785 func : callable, optional 786 The function used to compute the median. Defaults to `numpy.ma.median` 787 for masked arrays, otherwise to `numpy.median`. 788 ignore_nan : bool 789 Ignore NaN values (treat them as if they are not in the array) when 790 computing the median. This will use `numpy.ma.median` if ``axis`` is 791 specified, or `numpy.nanmedian` if ``axis==None`` and numpy's version 792 is >1.10 because nanmedian is slightly faster in this case. 793 794 Returns 795 ------- 796 mad : float or `~numpy.ndarray` 797 The median absolute deviation of the input array. If ``axis`` 798 is `None` then a scalar will be returned, otherwise a 799 `~numpy.ndarray` will be returned. 800 801 Examples 802 -------- 803 Generate random variates from a Gaussian distribution and return the 804 median absolute deviation for that distribution:: 805 806 >>> import numpy as np 807 >>> from astropy.stats import median_absolute_deviation 808 >>> rand = np.random.default_rng(12345) 809 >>> from numpy.random import randn 810 >>> mad = median_absolute_deviation(rand.standard_normal(1000)) 811 >>> print(mad) # doctest: +FLOAT_CMP 812 0.6829504282771885 813 814 See Also 815 -------- 816 mad_std 817 """ 818 819 if func is None: 820 # Check if the array has a mask and if so use np.ma.median 821 # See https://github.com/numpy/numpy/issues/7330 why using np.ma.median 822 # for normal arrays should not be done (summary: np.ma.median always 823 # returns an masked array even if the result should be scalar). (#4658) 824 if isinstance(data, np.ma.MaskedArray): 825 is_masked = True 826 func = np.ma.median 827 if ignore_nan: 828 data = np.ma.masked_where(np.isnan(data), data, copy=True) 829 elif ignore_nan: 830 is_masked = False 831 func = np.nanmedian 832 else: 833 is_masked = False 834 func = np.median # drops units if result is NaN 835 else: 836 is_masked = None 837 838 data = np.asanyarray(data) 839 # np.nanmedian has `keepdims`, which is a good option if we're not allowing 840 # user-passed functions here 841 data_median = func(data, axis=axis) 842 # this conditional can be removed after this PR is merged: 843 # https://github.com/astropy/astropy/issues/12165 844 if (isinstance(data, u.Quantity) and func is np.median 845 and data_median.ndim == 0 and np.isnan(data_median)): 846 data_median = data.__array_wrap__(data_median) 847 848 # broadcast the median array before subtraction 849 if axis is not None: 850 data_median = _expand_dims(data_median, axis=axis) # NUMPY_LT_1_18 851 852 result = func(np.abs(data - data_median), axis=axis, overwrite_input=True) 853 # this conditional can be removed after this PR is merged: 854 # https://github.com/astropy/astropy/issues/12165 855 if (isinstance(data, u.Quantity) and func is np.median 856 and result.ndim == 0 and np.isnan(result)): 857 result = data.__array_wrap__(result) 858 859 if axis is None and np.ma.isMaskedArray(result): 860 # return scalar version 861 result = result.item() 862 elif np.ma.isMaskedArray(result) and not is_masked: 863 # if the input array was not a masked array, we don't want to return a 864 # masked array 865 result = result.filled(fill_value=np.nan) 866 867 return result 868 869 870def mad_std(data, axis=None, func=None, ignore_nan=False): 871 r""" 872 Calculate a robust standard deviation using the `median absolute 873 deviation (MAD) 874 <https://en.wikipedia.org/wiki/Median_absolute_deviation>`_. 875 876 The standard deviation estimator is given by: 877 878 .. math:: 879 880 \sigma \approx \frac{\textrm{MAD}}{\Phi^{-1}(3/4)} 881 \approx 1.4826 \ \textrm{MAD} 882 883 where :math:`\Phi^{-1}(P)` is the normal inverse cumulative 884 distribution function evaluated at probability :math:`P = 3/4`. 885 886 Parameters 887 ---------- 888 data : array-like 889 Data array or object that can be converted to an array. 890 axis : None, int, or tuple of int, optional 891 The axis or axes along which the robust standard deviations are 892 computed. The default (`None`) is to compute the robust 893 standard deviation of the flattened array. 894 func : callable, optional 895 The function used to compute the median. Defaults to `numpy.ma.median` 896 for masked arrays, otherwise to `numpy.median`. 897 ignore_nan : bool 898 Ignore NaN values (treat them as if they are not in the array) when 899 computing the median. This will use `numpy.ma.median` if ``axis`` is 900 specified, or `numpy.nanmedian` if ``axis=None`` and numpy's version is 901 >1.10 because nanmedian is slightly faster in this case. 902 903 Returns 904 ------- 905 mad_std : float or `~numpy.ndarray` 906 The robust standard deviation of the input data. If ``axis`` is 907 `None` then a scalar will be returned, otherwise a 908 `~numpy.ndarray` will be returned. 909 910 Examples 911 -------- 912 >>> import numpy as np 913 >>> from astropy.stats import mad_std 914 >>> rand = np.random.default_rng(12345) 915 >>> madstd = mad_std(rand.normal(5, 2, (100, 100))) 916 >>> print(madstd) # doctest: +FLOAT_CMP 917 1.984147963351707 918 919 See Also 920 -------- 921 biweight_midvariance, biweight_midcovariance, median_absolute_deviation 922 """ 923 924 # NOTE: 1. / scipy.stats.norm.ppf(0.75) = 1.482602218505602 925 MAD = median_absolute_deviation( 926 data, axis=axis, func=func, ignore_nan=ignore_nan) 927 return MAD * 1.482602218505602 928 929 930def signal_to_noise_oir_ccd(t, source_eps, sky_eps, dark_eps, rd, npix, 931 gain=1.0): 932 """Computes the signal to noise ratio for source being observed in the 933 optical/IR using a CCD. 934 935 Parameters 936 ---------- 937 t : float or numpy.ndarray 938 CCD integration time in seconds 939 source_eps : float 940 Number of electrons (photons) or DN per second in the aperture from the 941 source. Note that this should already have been scaled by the filter 942 transmission and the quantum efficiency of the CCD. If the input is in 943 DN, then be sure to set the gain to the proper value for the CCD. 944 If the input is in electrons per second, then keep the gain as its 945 default of 1.0. 946 sky_eps : float 947 Number of electrons (photons) or DN per second per pixel from the sky 948 background. Should already be scaled by filter transmission and QE. 949 This must be in the same units as source_eps for the calculation to 950 make sense. 951 dark_eps : float 952 Number of thermal electrons per second per pixel. If this is given in 953 DN or ADU, then multiply by the gain to get the value in electrons. 954 rd : float 955 Read noise of the CCD in electrons. If this is given in 956 DN or ADU, then multiply by the gain to get the value in electrons. 957 npix : float 958 Size of the aperture in pixels 959 gain : float, optional 960 Gain of the CCD. In units of electrons per DN. 961 962 Returns 963 ------- 964 SNR : float or numpy.ndarray 965 Signal to noise ratio calculated from the inputs 966 """ 967 signal = t * source_eps * gain 968 noise = np.sqrt(t * (source_eps * gain + npix * 969 (sky_eps * gain + dark_eps)) + npix * rd ** 2) 970 return signal / noise 971 972 973def bootstrap(data, bootnum=100, samples=None, bootfunc=None): 974 """Performs bootstrap resampling on numpy arrays. 975 976 Bootstrap resampling is used to understand confidence intervals of sample 977 estimates. This function returns versions of the dataset resampled with 978 replacement ("case bootstrapping"). These can all be run through a function 979 or statistic to produce a distribution of values which can then be used to 980 find the confidence intervals. 981 982 Parameters 983 ---------- 984 data : ndarray 985 N-D array. The bootstrap resampling will be performed on the first 986 index, so the first index should access the relevant information 987 to be bootstrapped. 988 bootnum : int, optional 989 Number of bootstrap resamples 990 samples : int, optional 991 Number of samples in each resample. The default `None` sets samples to 992 the number of datapoints 993 bootfunc : function, optional 994 Function to reduce the resampled data. Each bootstrap resample will 995 be put through this function and the results returned. If `None`, the 996 bootstrapped data will be returned 997 998 Returns 999 ------- 1000 boot : ndarray 1001 1002 If bootfunc is None, then each row is a bootstrap resample of the data. 1003 If bootfunc is specified, then the columns will correspond to the 1004 outputs of bootfunc. 1005 1006 Examples 1007 -------- 1008 Obtain a twice resampled array: 1009 1010 >>> from astropy.stats import bootstrap 1011 >>> import numpy as np 1012 >>> from astropy.utils import NumpyRNGContext 1013 >>> bootarr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 0]) 1014 >>> with NumpyRNGContext(1): 1015 ... bootresult = bootstrap(bootarr, 2) 1016 ... 1017 >>> bootresult # doctest: +FLOAT_CMP 1018 array([[6., 9., 0., 6., 1., 1., 2., 8., 7., 0.], 1019 [3., 5., 6., 3., 5., 3., 5., 8., 8., 0.]]) 1020 >>> bootresult.shape 1021 (2, 10) 1022 1023 Obtain a statistic on the array 1024 1025 >>> with NumpyRNGContext(1): 1026 ... bootresult = bootstrap(bootarr, 2, bootfunc=np.mean) 1027 ... 1028 >>> bootresult # doctest: +FLOAT_CMP 1029 array([4. , 4.6]) 1030 1031 Obtain a statistic with two outputs on the array 1032 1033 >>> test_statistic = lambda x: (np.sum(x), np.mean(x)) 1034 >>> with NumpyRNGContext(1): 1035 ... bootresult = bootstrap(bootarr, 3, bootfunc=test_statistic) 1036 >>> bootresult # doctest: +FLOAT_CMP 1037 array([[40. , 4. ], 1038 [46. , 4.6], 1039 [35. , 3.5]]) 1040 >>> bootresult.shape 1041 (3, 2) 1042 1043 Obtain a statistic with two outputs on the array, keeping only the first 1044 output 1045 1046 >>> bootfunc = lambda x:test_statistic(x)[0] 1047 >>> with NumpyRNGContext(1): 1048 ... bootresult = bootstrap(bootarr, 3, bootfunc=bootfunc) 1049 ... 1050 >>> bootresult # doctest: +FLOAT_CMP 1051 array([40., 46., 35.]) 1052 >>> bootresult.shape 1053 (3,) 1054 1055 """ 1056 if samples is None: 1057 samples = data.shape[0] 1058 1059 # make sure the input is sane 1060 if samples < 1 or bootnum < 1: 1061 raise ValueError("neither 'samples' nor 'bootnum' can be less than 1.") 1062 1063 if bootfunc is None: 1064 resultdims = (bootnum,) + (samples,) + data.shape[1:] 1065 else: 1066 # test number of outputs from bootfunc, avoid single outputs which are 1067 # array-like 1068 try: 1069 resultdims = (bootnum, len(bootfunc(data))) 1070 except TypeError: 1071 resultdims = (bootnum,) 1072 1073 # create empty boot array 1074 boot = np.empty(resultdims) 1075 1076 for i in range(bootnum): 1077 bootarr = np.random.randint(low=0, high=data.shape[0], size=samples) 1078 if bootfunc is None: 1079 boot[i] = data[bootarr] 1080 else: 1081 boot[i] = bootfunc(data[bootarr]) 1082 1083 return boot 1084 1085 1086def _scipy_kraft_burrows_nousek(N, B, CL): 1087 '''Upper limit on a poisson count rate 1088 1089 The implementation is based on Kraft, Burrows and Nousek 1090 `ApJ 374, 344 (1991) <https://ui.adsabs.harvard.edu/abs/1991ApJ...374..344K>`_. 1091 The XMM-Newton upper limit server uses the same formalism. 1092 1093 Parameters 1094 ---------- 1095 N : int or np.int32/np.int64 1096 Total observed count number 1097 B : float or np.float32/np.float64 1098 Background count rate (assumed to be known with negligible error 1099 from a large background area). 1100 CL : float or np.float32/np.float64 1101 Confidence level (number between 0 and 1) 1102 1103 Returns 1104 ------- 1105 S : source count limit 1106 1107 Notes 1108 ----- 1109 Requires :mod:`~scipy`. This implementation will cause Overflow Errors for 1110 about N > 100 (the exact limit depends on details of how scipy was 1111 compiled). See `~astropy.stats.mpmath_poisson_upper_limit` for an 1112 implementation that is slower, but can deal with arbitrarily high numbers 1113 since it is based on the `mpmath <http://mpmath.org/>`_ library. 1114 ''' 1115 1116 from scipy.optimize import brentq 1117 from scipy.integrate import quad 1118 from scipy.special import factorial 1119 1120 from math import exp 1121 1122 def eqn8(N, B): 1123 n = np.arange(N + 1, dtype=np.float64) 1124 return 1. / (exp(-B) * np.sum(np.power(B, n) / factorial(n))) 1125 1126 # The parameters of eqn8 do not vary between calls so we can calculate the 1127 # result once and reuse it. The same is True for the factorial of N. 1128 # eqn7 is called hundred times so "caching" these values yields a 1129 # significant speedup (factor 10). 1130 eqn8_res = eqn8(N, B) 1131 factorial_N = float(math.factorial(N)) 1132 1133 def eqn7(S, N, B): 1134 SpB = S + B 1135 return eqn8_res * (exp(-SpB) * SpB**N / factorial_N) 1136 1137 def eqn9_left(S_min, S_max, N, B): 1138 return quad(eqn7, S_min, S_max, args=(N, B), limit=500) 1139 1140 def find_s_min(S_max, N, B): 1141 ''' 1142 Kraft, Burrows and Nousek suggest to integrate from N-B in both 1143 directions at once, so that S_min and S_max move similarly (see 1144 the article for details). Here, this is implemented differently: 1145 Treat S_max as the optimization parameters in func and then 1146 calculate the matching s_min that has has eqn7(S_max) = 1147 eqn7(S_min) here. 1148 ''' 1149 y_S_max = eqn7(S_max, N, B) 1150 if eqn7(0, N, B) >= y_S_max: 1151 return 0. 1152 else: 1153 return brentq(lambda x: eqn7(x, N, B) - y_S_max, 0, N - B) 1154 1155 def func(s): 1156 s_min = find_s_min(s, N, B) 1157 out = eqn9_left(s_min, s, N, B) 1158 return out[0] - CL 1159 1160 S_max = brentq(func, N - B, 100) 1161 S_min = find_s_min(S_max, N, B) 1162 return S_min, S_max 1163 1164 1165def _mpmath_kraft_burrows_nousek(N, B, CL): 1166 '''Upper limit on a poisson count rate 1167 1168 The implementation is based on Kraft, Burrows and Nousek in 1169 `ApJ 374, 344 (1991) <https://ui.adsabs.harvard.edu/abs/1991ApJ...374..344K>`_. 1170 The XMM-Newton upper limit server used the same formalism. 1171 1172 Parameters 1173 ---------- 1174 N : int or np.int32/np.int64 1175 Total observed count number 1176 B : float or np.float32/np.float64 1177 Background count rate (assumed to be known with negligible error 1178 from a large background area). 1179 CL : float or np.float32/np.float64 1180 Confidence level (number between 0 and 1) 1181 1182 Returns 1183 ------- 1184 S : source count limit 1185 1186 Notes 1187 ----- 1188 Requires the `mpmath <http://mpmath.org/>`_ library. See 1189 `~astropy.stats.scipy_poisson_upper_limit` for an implementation 1190 that is based on scipy and evaluates faster, but runs only to about 1191 N = 100. 1192 ''' 1193 from mpmath import mpf, factorial, findroot, fsum, power, exp, quad 1194 1195 # We convert these values to float. Because for some reason, 1196 # mpmath.mpf cannot convert from numpy.int64 1197 N = mpf(float(N)) 1198 B = mpf(float(B)) 1199 CL = mpf(float(CL)) 1200 tol = 1e-4 1201 1202 def eqn8(N, B): 1203 sumterms = [power(B, n) / factorial(n) for n in range(int(N) + 1)] 1204 return 1. / (exp(-B) * fsum(sumterms)) 1205 1206 eqn8_res = eqn8(N, B) 1207 factorial_N = factorial(N) 1208 1209 def eqn7(S, N, B): 1210 SpB = S + B 1211 return eqn8_res * (exp(-SpB) * SpB**N / factorial_N) 1212 1213 def eqn9_left(S_min, S_max, N, B): 1214 def eqn7NB(S): 1215 return eqn7(S, N, B) 1216 return quad(eqn7NB, [S_min, S_max]) 1217 1218 def find_s_min(S_max, N, B): 1219 ''' 1220 Kraft, Burrows and Nousek suggest to integrate from N-B in both 1221 directions at once, so that S_min and S_max move similarly (see 1222 the article for details). Here, this is implemented differently: 1223 Treat S_max as the optimization parameters in func and then 1224 calculate the matching s_min that has has eqn7(S_max) = 1225 eqn7(S_min) here. 1226 ''' 1227 y_S_max = eqn7(S_max, N, B) 1228 # If B > N, then N-B, the "most probable" values is < 0 1229 # and thus s_min is certainly 0. 1230 # Note: For small N, s_max is also close to 0 and root finding 1231 # might find the wrong root, thus it is important to handle this 1232 # case here and return the analytical answer (s_min = 0). 1233 if (B >= N) or (eqn7(0, N, B) >= y_S_max): 1234 return 0. 1235 else: 1236 def eqn7ysmax(x): 1237 return eqn7(x, N, B) - y_S_max 1238 return findroot(eqn7ysmax, [0., N - B], solver='ridder', 1239 tol=tol) 1240 1241 def func(s): 1242 s_min = find_s_min(s, N, B) 1243 out = eqn9_left(s_min, s, N, B) 1244 return out - CL 1245 1246 # Several numerical problems were found prevent the solvers from finding 1247 # the roots unless the starting values are very close to the final values. 1248 # Thus, this primitive, time-wasting, brute-force stepping here to get 1249 # an interval that can be fed into the ridder solver. 1250 s_max_guess = max(N - B, 1.) 1251 while func(s_max_guess) < 0: 1252 s_max_guess += 1 1253 S_max = findroot(func, [s_max_guess - 1, s_max_guess], solver='ridder', 1254 tol=tol) 1255 S_min = find_s_min(S_max, N, B) 1256 return float(S_min), float(S_max) 1257 1258 1259def _kraft_burrows_nousek(N, B, CL): 1260 '''Upper limit on a poisson count rate 1261 1262 The implementation is based on Kraft, Burrows and Nousek in 1263 `ApJ 374, 344 (1991) <https://ui.adsabs.harvard.edu/abs/1991ApJ...374..344K>`_. 1264 The XMM-Newton upper limit server used the same formalism. 1265 1266 Parameters 1267 ---------- 1268 N : int or np.int32/np.int64 1269 Total observed count number 1270 B : float or np.float32/np.float64 1271 Background count rate (assumed to be known with negligible error 1272 from a large background area). 1273 CL : float or np.float32/np.float64 1274 Confidence level (number between 0 and 1) 1275 1276 Returns 1277 ------- 1278 S : source count limit 1279 1280 Notes 1281 ----- 1282 This functions has an optional dependency: Either :mod:`scipy` or `mpmath 1283 <http://mpmath.org/>`_ need to be available. (Scipy only works for 1284 N < 100). 1285 ''' 1286 from astropy.utils.compat.optional_deps import HAS_SCIPY, HAS_MPMATH 1287 1288 if HAS_SCIPY and N <= 100: 1289 try: 1290 return _scipy_kraft_burrows_nousek(N, B, CL) 1291 except OverflowError: 1292 if not HAS_MPMATH: 1293 raise ValueError('Need mpmath package for input numbers this ' 1294 'large.') 1295 if HAS_MPMATH: 1296 return _mpmath_kraft_burrows_nousek(N, B, CL) 1297 1298 raise ImportError('Either scipy or mpmath are required.') 1299 1300 1301def kuiper_false_positive_probability(D, N): 1302 """Compute the false positive probability for the Kuiper statistic. 1303 1304 Uses the set of four formulas described in Paltani 2004; they report 1305 the resulting function never underestimates the false positive 1306 probability but can be a bit high in the N=40..50 range. 1307 (They quote a factor 1.5 at the 1e-7 level.) 1308 1309 Parameters 1310 ---------- 1311 D : float 1312 The Kuiper test score. 1313 N : float 1314 The effective sample size. 1315 1316 Returns 1317 ------- 1318 fpp : float 1319 The probability of a score this large arising from the null hypothesis. 1320 1321 Notes 1322 ----- 1323 Eq 7 of Paltani 2004 appears to incorrectly quote the original formula 1324 (Stephens 1965). This function implements the original formula, as it 1325 produces a result closer to Monte Carlo simulations. 1326 1327 References 1328 ---------- 1329 1330 .. [1] Paltani, S., "Searching for periods in X-ray observations using 1331 Kuiper's test. Application to the ROSAT PSPC archive", 1332 Astronomy and Astrophysics, v.240, p.789-790, 2004. 1333 1334 .. [2] Stephens, M. A., "The goodness-of-fit statistic VN: distribution 1335 and significance points", Biometrika, v.52, p.309, 1965. 1336 1337 """ 1338 try: 1339 from scipy.special import factorial, comb 1340 except ImportError: 1341 # Retained for backwards compatibility with older versions of scipy 1342 # (factorial appears to have moved here in 0.14) 1343 from scipy.misc import factorial, comb 1344 1345 if D < 0. or D > 2.: 1346 raise ValueError("Must have 0<=D<=2 by definition of the Kuiper test") 1347 1348 if D < 2. / N: 1349 return 1. - factorial(N) * (D - 1. / N)**(N - 1) 1350 elif D < 3. / N: 1351 k = -(N * D - 1.) / 2. 1352 r = np.sqrt(k**2 - (N * D - 2.)**2 / 2.) 1353 a, b = -k + r, -k - r 1354 return 1 - (factorial(N - 1) * (b**(N - 1) * (1 - a) - a**(N - 1) * (1 - b)) 1355 / N**(N - 2) / (b - a)) 1356 elif (D > 0.5 and N % 2 == 0) or (D > (N - 1.) / (2. * N) and N % 2 == 1): 1357 # NOTE: the upper limit of this sum is taken from Stephens 1965 1358 t = np.arange(np.floor(N * (1 - D)) + 1) 1359 y = D + t / N 1360 Tt = y**(t - 3) * (y**3 * N 1361 - y**2 * t * (3 - 2 / N) 1362 + y * t * (t - 1) * (3 - 2 / N) / N 1363 - t * (t - 1) * (t - 2) / N**2) 1364 term = Tt * comb(N, t) * (1 - D - t / N)**(N - t - 1) 1365 return term.sum() 1366 else: 1367 z = D * np.sqrt(N) 1368 # When m*z>18.82 (sqrt(-log(finfo(double))/2)), exp(-2m**2z**2) 1369 # underflows. Cutting off just before avoids triggering a (pointless) 1370 # underflow warning if `under="warn"`. 1371 ms = np.arange(1, 18.82 / z) 1372 S1 = (2 * (4 * ms**2 * z**2 - 1) * np.exp(-2 * ms**2 * z**2)).sum() 1373 S2 = (ms**2 * (4 * ms**2 * z**2 - 3) * np.exp(-2 * ms**2 * z**2)).sum() 1374 return S1 - 8 * D / 3 * S2 1375 1376 1377def kuiper(data, cdf=lambda x: x, args=()): 1378 """Compute the Kuiper statistic. 1379 1380 Use the Kuiper statistic version of the Kolmogorov-Smirnov test to 1381 find the probability that a sample like ``data`` was drawn from the 1382 distribution whose CDF is given as ``cdf``. 1383 1384 .. warning:: 1385 This will not work correctly for distributions that are actually 1386 discrete (Poisson, for example). 1387 1388 Parameters 1389 ---------- 1390 data : array-like 1391 The data values. 1392 cdf : callable 1393 A callable to evaluate the CDF of the distribution being tested 1394 against. Will be called with a vector of all values at once. 1395 The default is a uniform distribution. 1396 args : list-like, optional 1397 Additional arguments to be supplied to cdf. 1398 1399 Returns 1400 ------- 1401 D : float 1402 The raw statistic. 1403 fpp : float 1404 The probability of a D this large arising with a sample drawn from 1405 the distribution whose CDF is cdf. 1406 1407 Notes 1408 ----- 1409 The Kuiper statistic resembles the Kolmogorov-Smirnov test in that 1410 it is nonparametric and invariant under reparameterizations of the data. 1411 The Kuiper statistic, in addition, is equally sensitive throughout 1412 the domain, and it is also invariant under cyclic permutations (making 1413 it particularly appropriate for analyzing circular data). 1414 1415 Returns (D, fpp), where D is the Kuiper D number and fpp is the 1416 probability that a value as large as D would occur if data was 1417 drawn from cdf. 1418 1419 .. warning:: 1420 The fpp is calculated only approximately, and it can be 1421 as much as 1.5 times the true value. 1422 1423 Stephens 1970 claims this is more effective than the KS at detecting 1424 changes in the variance of a distribution; the KS is (he claims) more 1425 sensitive at detecting changes in the mean. 1426 1427 If cdf was obtained from data by fitting, then fpp is not correct and 1428 it will be necessary to do Monte Carlo simulations to interpret D. 1429 D should normally be independent of the shape of CDF. 1430 1431 References 1432 ---------- 1433 1434 .. [1] Stephens, M. A., "Use of the Kolmogorov-Smirnov, Cramer-Von Mises 1435 and Related Statistics Without Extensive Tables", Journal of the 1436 Royal Statistical Society. Series B (Methodological), Vol. 32, 1437 No. 1. (1970), pp. 115-122. 1438 1439 1440 """ 1441 1442 data = np.sort(data) 1443 cdfv = cdf(data, *args) 1444 N = len(data) 1445 D = (np.amax(cdfv - np.arange(N) / float(N)) + 1446 np.amax((np.arange(N) + 1) / float(N) - cdfv)) 1447 1448 return D, kuiper_false_positive_probability(D, N) 1449 1450 1451def kuiper_two(data1, data2): 1452 """Compute the Kuiper statistic to compare two samples. 1453 1454 Parameters 1455 ---------- 1456 data1 : array-like 1457 The first set of data values. 1458 data2 : array-like 1459 The second set of data values. 1460 1461 Returns 1462 ------- 1463 D : float 1464 The raw test statistic. 1465 fpp : float 1466 The probability of obtaining two samples this different from 1467 the same distribution. 1468 1469 .. warning:: 1470 The fpp is quite approximate, especially for small samples. 1471 1472 """ 1473 data1 = np.sort(data1) 1474 data2 = np.sort(data2) 1475 n1, = data1.shape 1476 n2, = data2.shape 1477 common_type = np.find_common_type([], [data1.dtype, data2.dtype]) 1478 if not (np.issubdtype(common_type, np.number) 1479 and not np.issubdtype(common_type, np.complexfloating)): 1480 raise ValueError('kuiper_two only accepts real inputs') 1481 # nans, if any, are at the end after sorting. 1482 if np.isnan(data1[-1]) or np.isnan(data2[-1]): 1483 raise ValueError('kuiper_two only accepts non-nan inputs') 1484 D = _stats.ks_2samp(np.asarray(data1, common_type), 1485 np.asarray(data2, common_type)) 1486 Ne = len(data1) * len(data2) / float(len(data1) + len(data2)) 1487 return D, kuiper_false_positive_probability(D, Ne) 1488 1489 1490def fold_intervals(intervals): 1491 """Fold the weighted intervals to the interval (0,1). 1492 1493 Convert a list of intervals (ai, bi, wi) to a list of non-overlapping 1494 intervals covering (0,1). Each output interval has a weight equal 1495 to the sum of the wis of all the intervals that include it. All intervals 1496 are interpreted modulo 1, and weights are accumulated counting 1497 multiplicity. This is appropriate, for example, if you have one or more 1498 blocks of observation and you want to determine how much observation 1499 time was spent on different parts of a system's orbit (the blocks 1500 should be converted to units of the orbital period first). 1501 1502 Parameters 1503 ---------- 1504 intervals : list of (3,) tuple 1505 For each tuple (ai,bi,wi); ai and bi are the limits of the interval, 1506 and wi is the weight to apply to the interval. 1507 1508 Returns 1509 ------- 1510 breaks : (N,) array of float 1511 The endpoints of a set of intervals covering [0,1]; breaks[0]=0 and 1512 breaks[-1] = 1 1513 weights : (N-1,) array of float 1514 The ith element is the sum of number of times the interval 1515 breaks[i],breaks[i+1] is included in each interval times the weight 1516 associated with that interval. 1517 1518 """ 1519 r = [] 1520 breaks = set() 1521 tot = 0 1522 for (a, b, wt) in intervals: 1523 tot += (np.ceil(b) - np.floor(a)) * wt 1524 fa = a % 1 1525 breaks.add(fa) 1526 r.append((0, fa, -wt)) 1527 fb = b % 1 1528 breaks.add(fb) 1529 r.append((fb, 1, -wt)) 1530 1531 breaks.add(0.) 1532 breaks.add(1.) 1533 breaks = sorted(breaks) 1534 breaks_map = dict([(f, i) for (i, f) in enumerate(breaks)]) 1535 totals = np.zeros(len(breaks) - 1) 1536 totals += tot 1537 for (a, b, wt) in r: 1538 totals[breaks_map[a]:breaks_map[b]] += wt 1539 return np.array(breaks), totals 1540 1541 1542def cdf_from_intervals(breaks, totals): 1543 """Construct a callable piecewise-linear CDF from a pair of arrays. 1544 1545 Take a pair of arrays in the format returned by fold_intervals and 1546 make a callable cumulative distribution function on the interval 1547 (0,1). 1548 1549 Parameters 1550 ---------- 1551 breaks : (N,) array of float 1552 The boundaries of successive intervals. 1553 totals : (N-1,) array of float 1554 The weight for each interval. 1555 1556 Returns 1557 ------- 1558 f : callable 1559 A cumulative distribution function corresponding to the 1560 piecewise-constant probability distribution given by breaks, weights 1561 1562 """ 1563 if breaks[0] != 0 or breaks[-1] != 1: 1564 raise ValueError("Intervals must be restricted to [0,1]") 1565 if np.any(np.diff(breaks) <= 0): 1566 raise ValueError("Breaks must be strictly increasing") 1567 if np.any(totals < 0): 1568 raise ValueError( 1569 "Total weights in each subinterval must be nonnegative") 1570 if np.all(totals == 0): 1571 raise ValueError("At least one interval must have positive exposure") 1572 b = breaks.copy() 1573 c = np.concatenate(((0,), np.cumsum(totals * np.diff(b)))) 1574 c /= c[-1] 1575 return lambda x: np.interp(x, b, c, 0, 1) 1576 1577 1578def interval_overlap_length(i1, i2): 1579 """Compute the length of overlap of two intervals. 1580 1581 Parameters 1582 ---------- 1583 i1, i2 : (float, float) 1584 The two intervals, (interval 1, interval 2). 1585 1586 Returns 1587 ------- 1588 l : float 1589 The length of the overlap between the two intervals. 1590 1591 """ 1592 (a, b) = i1 1593 (c, d) = i2 1594 if a < c: 1595 if b < c: 1596 return 0. 1597 elif b < d: 1598 return b - c 1599 else: 1600 return d - c 1601 elif a < d: 1602 if b < d: 1603 return b - a 1604 else: 1605 return d - a 1606 else: 1607 return 0 1608 1609 1610def histogram_intervals(n, breaks, totals): 1611 """Histogram of a piecewise-constant weight function. 1612 1613 This function takes a piecewise-constant weight function and 1614 computes the average weight in each histogram bin. 1615 1616 Parameters 1617 ---------- 1618 n : int 1619 The number of bins 1620 breaks : (N,) array of float 1621 Endpoints of the intervals in the PDF 1622 totals : (N-1,) array of float 1623 Probability densities in each bin 1624 1625 Returns 1626 ------- 1627 h : array of float 1628 The average weight for each bin 1629 1630 """ 1631 h = np.zeros(n) 1632 start = breaks[0] 1633 for i in range(len(totals)): 1634 end = breaks[i + 1] 1635 for j in range(n): 1636 ol = interval_overlap_length((float(j) / n, 1637 float(j + 1) / n), (start, end)) 1638 h[j] += ol / (1. / n) * totals[i] 1639 start = end 1640 1641 return h 1642