import pytest
from scipy.stats import (betabinom, hypergeom, nhypergeom, bernoulli,
                         boltzmann, skellam, zipf, zipfian, binom, nbinom,
                         nchypergeom_fisher, nchypergeom_wallenius, randint)

import numpy as np
from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
from scipy.special import binom as special_binom
from scipy.optimize import root_scalar
from scipy.integrate import quad


def test_hypergeom_logpmf():
    # symmetries test
    # f(k,N,K,n) = f(n-k,N,N-K,n) = f(K-k,N,K,N-n) = f(k,N,n,K)
    k = 5
    N = 50
    K = 10
    n = 5
    logpmf1 = hypergeom.logpmf(k, N, K, n)
    logpmf2 = hypergeom.logpmf(n - k, N, N - K, n)
    logpmf3 = hypergeom.logpmf(K - k, N, K, N - n)
    logpmf4 = hypergeom.logpmf(k, N, n, K)
    assert_almost_equal(logpmf1, logpmf2, decimal=12)
    assert_almost_equal(logpmf1, logpmf3, decimal=12)
    assert_almost_equal(logpmf1, logpmf4, decimal=12)

    # test related distribution
    # Bernoulli distribution if n = 1
    k = 1
    N = 10
    K = 7
    n = 1
    hypergeom_logpmf = hypergeom.logpmf(k, N, K, n)
    bernoulli_logpmf = bernoulli.logpmf(k, K/N)
    assert_almost_equal(hypergeom_logpmf, bernoulli_logpmf, decimal=12)


def test_nhypergeom_pmf():
    # test the relation to the hypergeometric distribution:
    # nhypergeom.pmf(k, M, n, r) = hypergeom.pmf(k, M, n, k+r-1)
    #                              * (M - n - (r-1)) / (M - (k+r-1))
    M, n, r = 45, 13, 8
    k = 6
    NHG = nhypergeom.pmf(k, M, n, r)
    HG = hypergeom.pmf(k, M, n, k+r-1) * (M - n - (r-1)) / (M - (k+r-1))
    assert_allclose(HG, NHG, rtol=1e-10)


def test_nhypergeom_pmfcdf():
    # test pmf and cdf with arbitrary values.
    M = 8
    n = 3
    r = 4
    support = np.arange(n+1)
    pmf = nhypergeom.pmf(support, M, n, r)
    cdf = nhypergeom.cdf(support, M, n, r)
    assert_allclose(pmf, [1/14, 3/14, 5/14, 5/14], rtol=1e-13)
    assert_allclose(cdf, [1/14, 4/14, 9/14, 1.0], rtol=1e-13)


def test_nhypergeom_r0():
    # test with `r = 0`.
    M = 10
    n = 3
    r = 0
    pmf = nhypergeom.pmf([[0, 1, 2, 0], [1, 2, 0, 3]], M, n, r)
    assert_allclose(pmf, [[1, 0, 0, 1], [0, 0, 1, 0]], rtol=1e-13)


def test_nhypergeom_rvs_shape():
    # Check that when given a size with more dimensions than the
    # dimensions of the broadcast parameters, rvs returns an array
    # with the correct shape.
    x = nhypergeom.rvs(22, [7, 8, 9], [[12], [13]], size=(5, 1, 2, 3))
    assert x.shape == (5, 1, 2, 3)


def test_nhypergeom_accuracy():
    # Check that nhypergeom.rvs post-gh-13431 gives the same values as
    # inverse transform sampling
    np.random.seed(0)
    x = nhypergeom.rvs(22, 7, 11, size=100)
    np.random.seed(0)
    p = np.random.uniform(size=100)
    y = nhypergeom.ppf(p, 22, 7, 11)
    assert_equal(x, y)


def test_boltzmann_upper_bound():
    k = np.arange(-3, 5)

    N = 1
    p = boltzmann.pmf(k, 0.123, N)
    expected = k == 0
    assert_equal(p, expected)

    lam = np.log(2)
    N = 3
    p = boltzmann.pmf(k, lam, N)
    expected = [0, 0, 0, 4/7, 2/7, 1/7, 0, 0]
    assert_allclose(p, expected, rtol=1e-13)

    c = boltzmann.cdf(k, lam, N)
    expected = [0, 0, 0, 4/7, 6/7, 1, 1, 1]
    assert_allclose(c, expected, rtol=1e-13)


def test_betabinom_a_and_b_unity():
    # test limiting case that betabinom(n, 1, 1) is a discrete uniform
    # distribution from 0 to n
    n = 20
    k = np.arange(n + 1)
    p = betabinom(n, 1, 1).pmf(k)
    expected = np.repeat(1 / (n + 1), n + 1)
    assert_almost_equal(p, expected)


def test_betabinom_bernoulli():
    # test limiting case that betabinom(1, a, b) = bernoulli(a / (a + b))
    a = 2.3
    b = 0.63
    k = np.arange(2)
    p = betabinom(1, a, b).pmf(k)
    expected = bernoulli(a / (a + b)).pmf(k)
    assert_almost_equal(p, expected)


def test_issue_10317():
    alpha, n, p = 0.9, 10, 1
    assert_equal(nbinom.interval(alpha=alpha, n=n, p=p), (0, 0))


def test_issue_11134():
    alpha, n, p = 0.95, 10, 0
    assert_equal(binom.interval(alpha=alpha, n=n, p=p), (0, 0))


def test_issue_7406():
    np.random.seed(0)
    assert_equal(binom.ppf(np.random.rand(10), 0, 0.5), 0)

    # Also check that endpoints (q=0, q=1) are correct
    assert_equal(binom.ppf(0, 0, 0.5), -1)
    assert_equal(binom.ppf(1, 0, 0.5), 0)


def test_issue_5122():
    p = 0
    n = np.random.randint(100, size=10)

    x = 0
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, -1)

    x = np.linspace(0.01, 0.99, 10)
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, 0)

    x = 1
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, n)


def test_issue_1603():
    assert_equal(binom(1000, np.logspace(-3, -100)).ppf(0.01), 0)


def test_issue_5503():
    p = 0.5
    x = np.logspace(3, 14, 12)
    assert_allclose(binom.cdf(x, 2*x, p), 0.5, atol=1e-2)


@pytest.mark.parametrize('x, n, p, cdf_desired', [
    (300, 1000, 3/10, 0.51559351981411995636),
    (3000, 10000, 3/10, 0.50493298381929698016),
    (30000, 100000, 3/10, 0.50156000591726422864),
    (300000, 1000000, 3/10, 0.50049331906666960038),
    (3000000, 10000000, 3/10, 0.50015600124585261196),
    (30000000, 100000000, 3/10, 0.50004933192735230102),
    (30010000, 100000000, 3/10, 0.98545384016570790717),
    (29990000, 100000000, 3/10, 0.01455017177985268670),
    (29950000, 100000000, 3/10, 5.02250963487432024943e-28),
])
def test_issue_5503pt2(x, n, p, cdf_desired):
    assert_allclose(binom.cdf(x, n, p), cdf_desired)


def test_issue_5503pt3():
    # From Wolfram Alpha: CDF[BinomialDistribution[1e12, 1e-12], 2]
    assert_allclose(binom.cdf(2, 10**12, 10**-12), 0.91969860292869777384)
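

# The gh-5503 tests above probe binom.cdf at the mean x = n*p, where the value
# approaches 1/2 as n grows (the excess over 0.5 shrinks roughly like
# 1/sqrt(n)). A minimal distilled sketch of that property; the test name and
# the values n = 10**6, p = 0.3 are illustrative choices, not reference data.
def test_binom_cdf_near_mean_sketch():
    n, p = 10**6, 0.3
    assert_allclose(binom.cdf(n*p, n, p), 0.5, atol=1e-3)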


def test_issue_6682():
    # Reference value from R:
    # options(digits=16)
    # print(pnbinom(250, 50, 32/63, lower.tail=FALSE))
    assert_allclose(nbinom.sf(250, 50, 32./63.),
                    1.460458510976452e-35)


def test_skellam_gh11474():
    # test issue reported in gh-11474 caused by `cdfchn`
    mu = [1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000]
    cdf = skellam.cdf(0, mu, mu)
    # generated in R
    # library(skellam)
    # options(digits = 16)
    # mu = c(1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000)
    # pskellam(0, mu, mu, TRUE)
    cdf_expected = [0.6542541612768356, 0.5448901559424127, 0.5141135799745580,
                    0.5044605891382528, 0.5019947363350450, 0.5019848365953181,
                    0.5019750827993392, 0.5019466621805060, 0.5018209330219539]
    assert_allclose(cdf, cdf_expected)


class TestZipfian:
    def test_zipfian_asymptotic(self):
        # test limiting case that zipfian(a, n) -> zipf(a) as n-> oo
        a = 6.5
        N = 10000000
        k = np.arange(1, 21)
        assert_allclose(zipfian.pmf(k, a, N), zipf.pmf(k, a))
        assert_allclose(zipfian.cdf(k, a, N), zipf.cdf(k, a))
        assert_allclose(zipfian.sf(k, a, N), zipf.sf(k, a))
        assert_allclose(zipfian.stats(a, N, moments='msvk'),
                        zipf.stats(a, moments='msvk'))

    def test_zipfian_continuity(self):
        # test that zipfian(0.999999, n) ~ zipfian(1.000001, n)
        # (a = 1 switches between methods of calculating harmonic sum)
        alt1, agt1 = 0.99999999, 1.00000001
        N = 30
        k = np.arange(1, N + 1)
        assert_allclose(zipfian.pmf(k, alt1, N), zipfian.pmf(k, agt1, N))
        assert_allclose(zipfian.cdf(k, alt1, N), zipfian.cdf(k, agt1, N))
        assert_allclose(zipfian.sf(k, alt1, N), zipfian.sf(k, agt1, N))
        assert_allclose(zipfian.stats(alt1, N, moments='msvk'),
                        zipfian.stats(agt1, N, moments='msvk'), rtol=2e-7)

    def test_zipfian_R(self):
        # test against R VGAM package
        # library(VGAM)
        # k <- c(13, 16, 1, 4, 4, 8, 10, 19, 5, 7)
        # a <- c(1.56712977, 3.72656295, 5.77665117, 9.12168729, 5.79977172,
        #        4.92784796, 9.36078764, 4.3739616 , 7.48171872, 4.6824154)
        # n <- c(70, 80, 48, 65, 83, 89, 50, 30, 20, 20)
        # pmf <- dzipf(k, N = n, shape = a)
        # cdf <- pzipf(k, N = n, shape = a)
        # print(pmf)
        # print(cdf)
        np.random.seed(0)
        k = np.random.randint(1, 20, size=10)
        a = np.random.rand(10)*10 + 1
        n = np.random.randint(1, 100, size=10)
        pmf = [8.076972e-03, 2.950214e-05, 9.799333e-01, 3.216601e-06,
               3.158895e-04, 3.412497e-05, 4.350472e-10, 2.405773e-06,
               5.860662e-06, 1.053948e-04]
        cdf = [0.8964133, 0.9998666, 0.9799333, 0.9999995, 0.9998584,
               0.9999458, 1.0000000, 0.9999920, 0.9999977, 0.9998498]
        # skip the first point; zipUC is not accurate for low a, n
        assert_allclose(zipfian.pmf(k, a, n)[1:], pmf[1:], rtol=1e-6)
        assert_allclose(zipfian.cdf(k, a, n)[1:], cdf[1:], rtol=5e-5)

    np.random.seed(0)
    naive_tests = np.vstack((np.logspace(-2, 1, 10),
                             np.random.randint(2, 40, 10))).T

    @pytest.mark.parametrize("a, n", naive_tests)
    def test_zipfian_naive(self, a, n):
        # test against bare-bones implementation

        @np.vectorize
        def Hns(n, s):
            """Naive implementation of harmonic sum"""
            return (1/np.arange(1, n+1)**s).sum()

        @np.vectorize
        def pzip(k, a, n):
            """Naive implementation of zipfian pmf"""
            if k < 1 or k > n:
                return 0.
            else:
                return 1 / k**a / Hns(n, a)

        k = np.arange(n+1)
        pmf = pzip(k, a, n)
        cdf = np.cumsum(pmf)
        mean = np.average(k, weights=pmf)
        var = np.average((k - mean)**2, weights=pmf)
        std = var**0.5
        skew = np.average(((k-mean)/std)**3, weights=pmf)
        kurtosis = np.average(((k-mean)/std)**4, weights=pmf) - 3
        assert_allclose(zipfian.pmf(k, a, n), pmf)
        assert_allclose(zipfian.cdf(k, a, n), cdf)
        assert_allclose(zipfian.stats(a, n, moments="mvsk"),
                        [mean, var, skew, kurtosis])
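

# The parametrized naive check above compares against the textbook definition
# pmf(k; a, n) = k**(-a) / H(n, a), where H(n, a) = sum(i**(-a) for i = 1..n)
# is the generalized harmonic number. A single distilled instance of the same
# identity; the test name and the values a = 1.5, n = 25 are illustrative
# choices, not reference data.
def test_zipfian_matches_direct_formula():
    a, n = 1.5, 25
    k = np.arange(1, n + 1)
    harmonic = np.sum(1.0 / k**a)
    assert_allclose(zipfian.pmf(k, a, n), 1.0 / k**a / harmonic)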


class TestNCH():
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)  # red balls
    m2 = np.random.randint(1, max_m, size=shape)  # white balls
    N = m1 + m2  # total balls
    n = randint.rvs(0, N, size=N.shape)  # number of draws
    xl = np.maximum(0, n-m2)  # lower bound of support
    xu = np.minimum(n, m1)  # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape)*2

    # test output is more readable when function names (strings) are passed
    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        # Both noncentral hypergeometric distributions reduce to the
        # hypergeometric distribution when odds = 1
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        assert_allclose(dist.pmf(x, N, m1, n, odds=1),
                        hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        # test against a very simple implementation
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        @np.vectorize
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
            m2 = N - m1
            xl = np.maximum(0, n-m2)
            xu = np.minimum(n, m1)

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                return t1 * t2 * w**x

            def P(k):
                return sum(f(y)*y**k for y in range(xl, xu + 1))

            P0 = P(0)
            P1 = P(1)
            P2 = P(2)
            pmf = f(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'),
                        mean)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'),
                        var)

    def test_nchypergeom_wallenius_naive(self):
        # test against a very simple implementation

        np.random.seed(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n-m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape)*2

        def support(N, m1, n, w):
            m2 = N - m1
            xl = np.maximum(0, n-m2)
            xu = np.minimum(n, m1)
            return xl, xu

        @np.vectorize
        def mean(N, m1, n, w):
            # the mean is approximated by the root u of
            # u/m1 + (1 - (n-u)/m2)**w - 1 = 0, bracketed by the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def fun(u):
                return u/m1 + (1 - (n-u)/m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w), rtol=2e-2)

        @np.vectorize
        def variance(N, m1, n, w):
            # approximate variance, computed from the approximate mean u
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n-u)*(u + m2 - n)
            return N*a*b / ((N-1) * (m1*b + m2*a))

        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w), rtol=5e-2)

        @np.vectorize
        def pmf(x, N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w*(m1 - x) + (m2 - (n-x))
                res = (1-t**(w/D))**x * (1-t**(1/D))**(n-x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                the_integral = quad(integrand, 0, 1,
                                    epsrel=1e-16, epsabs=1e-16)
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol*np.abs(pmf0)
        assert i.sum() > np.prod(shape) / 2  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)

    def test_wallenius_against_mpmath(self):
        # precompute data with mpmath since naive implementation above
        # is not reliable. See source code in gh-13330.
        M = 50
        n = 30
        N = 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        pmf = np.array([3.699003068656875e-20,
                        5.89398584245431e-17,
                        2.1594437742911123e-14,
                        3.221458044649955e-12,
                        2.4658279241205077e-10,
                        1.0965862603981212e-08,
                        3.057890479665704e-07,
                        5.622818831643761e-06,
                        7.056482841531681e-05,
                        0.000618899425358671,
                        0.003854172932571669,
                        0.01720592676256026,
                        0.05528844897093792,
                        0.12772363313574242,
                        0.21065898367825722,
                        0.24465958845359234,
                        0.1955114898110033,
                        0.10355390084949237,
                        0.03414490375225675,
                        0.006231989845775931,
                        0.0004715577304677075])
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and pmf(2)
        # has only three digits of accuracy (~ 2.1511e-14).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds), pmf,
                        rtol=1e-13, atol=1e-13)
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
                        mean, rtol=1e-13)
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),
                        var, rtol=1e-11)
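
    # A distilled form of the Fisher naive check above: over the support, the
    # pmf is proportional to binom(m1, x) * binom(m2, n - x) * odds**x. The
    # method name and the parameter values below are illustrative choices,
    # not reference data.
    def test_fisher_pmf_proportionality(self):
        N, m1, n, odds = 50, 20, 15, 1.5
        m2 = N - m1
        x = np.arange(max(0, n - m2), min(n, m1) + 1)
        weights = special_binom(m1, x) * special_binom(m2, n - x) * odds**x
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds),
                        weights / weights.sum())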

    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        # Check that when given a size with more dimensions than the
        # dimensions of the broadcast parameters, rvs returns an array
        # with the correct shape.
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0], size=(5, 1, 2, 3))
        assert x.shape == (5, 1, 2, 3)


@pytest.mark.parametrize("mu, q, expected",
                         [[10, 120, -1.240089881791596e-38],
                          [1500, 0, -86.61466680572661]])
def test_nbinom_11465(mu, q, expected):
    # test nbinom.logcdf at extreme tails
    size = 20
    n, p = size, size/(size+mu)
    # In R (shown for the first parameter set):
    # options(digits=16)
    # pnbinom(mu=10, size=20, q=120, log.p=TRUE)
    assert_allclose(nbinom.logcdf(q, n, p), expected)
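

# The conversion used in test_nbinom_11465 follows the usual relation between
# R's (size, mu) parametrization and SciPy's (n, p): mu = n*(1 - p)/p, hence
# p = size/(size + mu). A small illustrative check of that identity; the test
# name and values are illustrative choices, not reference data.
def test_nbinom_size_mu_conversion():
    size, mu = 20, 10
    n, p = size, size/(size + mu)
    assert_allclose(nbinom.mean(n, p), mu)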