import pytest
from scipy.stats import (betabinom, hypergeom, nhypergeom, bernoulli,
                         boltzmann, skellam, zipf, zipfian, binom, nbinom,
                         nchypergeom_fisher, nchypergeom_wallenius, randint)

import numpy as np
from numpy.testing import assert_almost_equal, assert_equal, assert_allclose
from scipy.special import binom as special_binom
from scipy.optimize import root_scalar
from scipy.integrate import quad


def test_hypergeom_logpmf():
    # symmetry tests
    # f(k, N, K, n) = f(n-k, N, N-K, n) = f(K-k, N, K, N-n) = f(k, N, n, K)
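    # (These follow from relabeling: count untagged draws instead of tagged,
    # count tagged objects among the undrawn, or swap the roles of K and n.)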
    k = 5
    N = 50
    K = 10
    n = 5
    logpmf1 = hypergeom.logpmf(k, N, K, n)
    logpmf2 = hypergeom.logpmf(n - k, N, N - K, n)
    logpmf3 = hypergeom.logpmf(K - k, N, K, N - n)
    logpmf4 = hypergeom.logpmf(k, N, n, K)
    assert_almost_equal(logpmf1, logpmf2, decimal=12)
    assert_almost_equal(logpmf1, logpmf3, decimal=12)
    assert_almost_equal(logpmf1, logpmf4, decimal=12)

    # test related distribution
    # Bernoulli distribution if n = 1
    k = 1
    N = 10
    K = 7
    n = 1
    hypergeom_logpmf = hypergeom.logpmf(k, N, K, n)
    bernoulli_logpmf = bernoulli.logpmf(k, K/N)
    assert_almost_equal(hypergeom_logpmf, bernoulli_logpmf, decimal=12)


def test_nhypergeom_pmf():
    # test with hypergeom
    M, n, r = 45, 13, 8
    k = 6
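    # nhypergeom(M, n, r) counts successes k drawn before the r-th failure.
    # Equivalently: k successes occur in the first k+r-1 draws (a
    # hypergeometric event), and then the next draw is a failure, which has
    # conditional probability (M-n-(r-1)) / (M-(k+r-1)).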
    NHG = nhypergeom.pmf(k, M, n, r)
    HG = hypergeom.pmf(k, M, n, k+r-1) * (M - n - (r-1)) / (M - (k+r-1))
    assert_allclose(HG, NHG, rtol=1e-10)


def test_nhypergeom_pmfcdf():
    # test pmf and cdf with arbitrary values.
    M = 8
    n = 3
    r = 4
    support = np.arange(n+1)
    pmf = nhypergeom.pmf(support, M, n, r)
    cdf = nhypergeom.cdf(support, M, n, r)
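    # The reference values can be checked by hand from the closed form
    # pmf(k) = C(k+r-1, k) * C(M-r-k, n-k) / C(M, n):
    # here C(M, n) = C(8, 3) = 56, and the numerators are 4, 12, 20, 20.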
    assert_allclose(pmf, [1/14, 3/14, 5/14, 5/14], rtol=1e-13)
    assert_allclose(cdf, [1/14, 4/14, 9/14, 1.0], rtol=1e-13)


def test_nhypergeom_r0():
    # test with `r = 0`.
    M = 10
    n = 3
    r = 0
    pmf = nhypergeom.pmf([[0, 1, 2, 0], [1, 2, 0, 3]], M, n, r)
    assert_allclose(pmf, [[1, 0, 0, 1], [0, 0, 1, 0]], rtol=1e-13)


def test_nhypergeom_rvs_shape():
    # Check that when given a size with more dimensions than the
    # dimensions of the broadcast parameters, rvs returns an array
    # with the correct shape.
    x = nhypergeom.rvs(22, [7, 8, 9], [[12], [13]], size=(5, 1, 2, 3))
    assert x.shape == (5, 1, 2, 3)


def test_nhypergeom_accuracy():
    # Check that nhypergeom.rvs post-gh-13431 gives the same values as
    # inverse transform sampling
    np.random.seed(0)
    x = nhypergeom.rvs(22, 7, 11, size=100)
    np.random.seed(0)
    p = np.random.uniform(size=100)
    y = nhypergeom.ppf(p, 22, 7, 11)
    assert_equal(x, y)


def test_boltzmann_upper_bound():
    k = np.arange(-3, 5)

    N = 1
    p = boltzmann.pmf(k, 0.123, N)
    expected = k == 0
    assert_equal(p, expected)

    lam = np.log(2)
    N = 3
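    # With lam = log(2), pmf(k) is proportional to 2**(-k) on k = 0, ..., N-1,
    # so normalizing (1, 1/2, 1/4) gives (4/7, 2/7, 1/7).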
    p = boltzmann.pmf(k, lam, N)
    expected = [0, 0, 0, 4/7, 2/7, 1/7, 0, 0]
    assert_allclose(p, expected, rtol=1e-13)

    c = boltzmann.cdf(k, lam, N)
    expected = [0, 0, 0, 4/7, 6/7, 1, 1, 1]
    assert_allclose(c, expected, rtol=1e-13)


def test_betabinom_a_and_b_unity():
    # test limiting case that betabinom(n, 1, 1) is a discrete uniform
    # distribution from 0 to n
    n = 20
    k = np.arange(n + 1)
    p = betabinom(n, 1, 1).pmf(k)
    expected = np.repeat(1 / (n + 1), n + 1)
    assert_almost_equal(p, expected)


def test_betabinom_bernoulli():
    # test limiting case that betabinom(1, a, b) = bernoulli(a / (a + b))
    a = 2.3
    b = 0.63
    k = np.arange(2)
    p = betabinom(1, a, b).pmf(k)
    expected = bernoulli(a / (a + b)).pmf(k)
    assert_almost_equal(p, expected)


def test_issue_10317():
    alpha, n, p = 0.9, 10, 1
    assert_equal(nbinom.interval(alpha=alpha, n=n, p=p), (0, 0))


def test_issue_11134():
    alpha, n, p = 0.95, 10, 0
    assert_equal(binom.interval(alpha=alpha, n=n, p=p), (0, 0))


def test_issue_7406():
    np.random.seed(0)
    assert_equal(binom.ppf(np.random.rand(10), 0, 0.5), 0)

    # Also check that endpoints (q=0, q=1) are correct
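    # (by scipy convention, the ppf of a discrete distribution at q=0 is one
    # below the minimum of the support, hence -1 here)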
    assert_equal(binom.ppf(0, 0, 0.5), -1)
    assert_equal(binom.ppf(1, 0, 0.5), 0)


def test_issue_5122():
    p = 0
    n = np.random.randint(100, size=10)

    x = 0
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, -1)

    x = np.linspace(0.01, 0.99, 10)
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, 0)

    x = 1
    ppf = binom.ppf(x, n, p)
    assert_equal(ppf, n)


def test_issue_1603():
    assert_equal(binom(1000, np.logspace(-3, -100)).ppf(0.01), 0)


def test_issue_5503():
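    # Binomial(2x, 1/2) is symmetric about x, so cdf(x, 2x, 1/2) equals
    # 1/2 + pmf(x, 2x, 1/2)/2, which approaches 1/2 as x grows.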
    p = 0.5
    x = np.logspace(3, 14, 12)
    assert_allclose(binom.cdf(x, 2*x, p), 0.5, atol=1e-2)


@pytest.mark.parametrize('x, n, p, cdf_desired', [
    (300, 1000, 3/10, 0.51559351981411995636),
    (3000, 10000, 3/10, 0.50493298381929698016),
    (30000, 100000, 3/10, 0.50156000591726422864),
    (300000, 1000000, 3/10, 0.50049331906666960038),
    (3000000, 10000000, 3/10, 0.50015600124585261196),
    (30000000, 100000000, 3/10, 0.50004933192735230102),
    (30010000, 100000000, 3/10, 0.98545384016570790717),
    (29990000, 100000000, 3/10, 0.01455017177985268670),
    (29950000, 100000000, 3/10, 5.02250963487432024943e-28),
])
def test_issue_5503pt2(x, n, p, cdf_desired):
    assert_allclose(binom.cdf(x, n, p), cdf_desired)


def test_issue_5503pt3():
    # From Wolfram Alpha: CDF[BinomialDistribution[1e12, 1e-12], 2]
    assert_allclose(binom.cdf(2, 10**12, 10**-12), 0.91969860292869777384)


def test_issue_6682():
    # Reference value from R:
    # options(digits=16)
    # print(pnbinom(250, 50, 32/63, lower.tail=FALSE))
    assert_allclose(nbinom.sf(250, 50, 32./63.), 1.460458510976452e-35)


def test_skellam_gh11474():
    # test issue reported in gh-11474 caused by `cdfchn`
    mu = [1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000]
    cdf = skellam.cdf(0, mu, mu)
    # generated in R
    # library(skellam)
    # options(digits = 16)
    # mu = c(1, 10, 100, 1000, 5000, 5050, 5100, 5250, 6000)
    # pskellam(0, mu, mu, TRUE)
    cdf_expected = [0.6542541612768356, 0.5448901559424127, 0.5141135799745580,
                    0.5044605891382528, 0.5019947363350450, 0.5019848365953181,
                    0.5019750827993392, 0.5019466621805060, 0.5018209330219539]
    assert_allclose(cdf, cdf_expected)


class TestZipfian:
    def test_zipfian_asymptotic(self):
        # test limiting case that zipfian(a, n) -> zipf(a) as n -> oo
        a = 6.5
        N = 10000000
        k = np.arange(1, 21)
        assert_allclose(zipfian.pmf(k, a, N), zipf.pmf(k, a))
        assert_allclose(zipfian.cdf(k, a, N), zipf.cdf(k, a))
        assert_allclose(zipfian.sf(k, a, N), zipf.sf(k, a))
        assert_allclose(zipfian.stats(a, N, moments='msvk'),
                        zipf.stats(a, moments='msvk'))

    def test_zipfian_continuity(self):
        # test that zipfian(0.99999999, n) ~ zipfian(1.00000001, n)
        # (a = 1 switches between methods of calculating the harmonic sum)
        alt1, agt1 = 0.99999999, 1.00000001
        N = 30
        k = np.arange(1, N + 1)
        assert_allclose(zipfian.pmf(k, alt1, N), zipfian.pmf(k, agt1, N))
        assert_allclose(zipfian.cdf(k, alt1, N), zipfian.cdf(k, agt1, N))
        assert_allclose(zipfian.sf(k, alt1, N), zipfian.sf(k, agt1, N))
        assert_allclose(zipfian.stats(alt1, N, moments='msvk'),
                        zipfian.stats(agt1, N, moments='msvk'), rtol=2e-7)

    def test_zipfian_R(self):
        # test against R VGAM package
        # library(VGAM)
        # k <- c(13, 16,  1,  4,  4,  8, 10, 19,  5,  7)
        # a <- c(1.56712977, 3.72656295, 5.77665117, 9.12168729, 5.79977172,
        #        4.92784796, 9.36078764, 4.3739616 , 7.48171872, 4.6824154)
        # n <- c(70, 80, 48, 65, 83, 89, 50, 30, 20, 20)
        # pmf <- dzipf(k, N = n, shape = a)
        # cdf <- pzipf(k, N = n, shape = a)
        # print(pmf)
        # print(cdf)
        np.random.seed(0)
        k = np.random.randint(1, 20, size=10)
        a = np.random.rand(10)*10 + 1
        n = np.random.randint(1, 100, size=10)
        pmf = [8.076972e-03, 2.950214e-05, 9.799333e-01, 3.216601e-06,
               3.158895e-04, 3.412497e-05, 4.350472e-10, 2.405773e-06,
               5.860662e-06, 1.053948e-04]
        cdf = [0.8964133, 0.9998666, 0.9799333, 0.9999995, 0.9998584,
               0.9999458, 1.0000000, 0.9999920, 0.9999977, 0.9998498]
        # skip the first point; zipUC is not accurate for low a, n
        assert_allclose(zipfian.pmf(k, a, n)[1:], pmf[1:], rtol=1e-6)
        assert_allclose(zipfian.cdf(k, a, n)[1:], cdf[1:], rtol=5e-5)

    np.random.seed(0)
    naive_tests = np.vstack((np.logspace(-2, 1, 10),
                             np.random.randint(2, 40, 10))).T

    @pytest.mark.parametrize("a, n", naive_tests)
    def test_zipfian_naive(self, a, n):
        # test against a bare-bones implementation

        @np.vectorize
        def Hns(n, s):
            """Naive implementation of harmonic sum"""
            return (1/np.arange(1, n+1)**s).sum()

        @np.vectorize
        def pzip(k, a, n):
            """Naive implementation of zipfian pmf"""
            if k < 1 or k > n:
                return 0.
            else:
                return 1 / k**a / Hns(n, a)

        k = np.arange(n+1)
        pmf = pzip(k, a, n)
        cdf = np.cumsum(pmf)
        mean = np.average(k, weights=pmf)
        var = np.average((k - mean)**2, weights=pmf)
        std = var**0.5
        skew = np.average(((k-mean)/std)**3, weights=pmf)
        kurtosis = np.average(((k-mean)/std)**4, weights=pmf) - 3
        assert_allclose(zipfian.pmf(k, a, n), pmf)
        assert_allclose(zipfian.cdf(k, a, n), cdf)
        assert_allclose(zipfian.stats(a, n, moments="mvsk"),
                        [mean, var, skew, kurtosis])


class TestNCH:
    np.random.seed(2)  # seeds 0 and 1 had some xl = xu; randint failed
    shape = (2, 4, 3)
    max_m = 100
    m1 = np.random.randint(1, max_m, size=shape)    # red balls
    m2 = np.random.randint(1, max_m, size=shape)    # white balls
    N = m1 + m2                                     # total balls
    n = randint.rvs(0, N, size=N.shape)             # number of draws
    xl = np.maximum(0, n-m2)                        # lower bound of support
    xu = np.minimum(n, m1)                          # upper bound of support
    x = randint.rvs(xl, xu, size=xl.shape)
    odds = np.random.rand(*x.shape)*2

    # test output is more readable when function names (strings) are passed
    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_nch_hypergeom(self, dist_name):
        # Both noncentral hypergeometric distributions reduce to the
        # hypergeometric distribution when odds = 1
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x, N, m1, n = self.x, self.N, self.m1, self.n
        assert_allclose(dist.pmf(x, N, m1, n, odds=1),
                        hypergeom.pmf(x, N, m1, n))

    def test_nchypergeom_fisher_naive(self):
        # test against a very simple implementation
        x, N, m1, n, odds = self.x, self.N, self.m1, self.n, self.odds

        @np.vectorize
        def pmf_mean_var(x, N, m1, n, w):
            # simple implementation of nchypergeom_fisher pmf
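            # The pmf is proportional to C(m1, x) * C(m2, n-x) * w**x on the
            # support [xl, xu]; P0 below is the normalizing constant, and
            # P1/P0 and P2/P0 - (P1/P0)**2 give the mean and variance.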
            m2 = N - m1
            xl = np.maximum(0, n-m2)
            xu = np.minimum(n, m1)

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                return t1 * t2 * w**x

            def P(k):
                return sum(f(y)*y**k for y in range(xl, xu + 1))

            P0 = P(0)
            P1 = P(1)
            P2 = P(2)
            pmf = f(x) / P0
            mean = P1 / P0
            var = P2 / P0 - (P1 / P0)**2
            return pmf, mean, var

        pmf, mean, var = pmf_mean_var(x, N, m1, n, odds)
        assert_allclose(nchypergeom_fisher.pmf(x, N, m1, n, odds), pmf)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='m'),
                        mean)
        assert_allclose(nchypergeom_fisher.stats(N, m1, n, odds, moments='v'),
                        var)

    def test_nchypergeom_wallenius_naive(self):
        # test against a very simple implementation

        np.random.seed(2)
        shape = (2, 4, 3)
        max_m = 100
        m1 = np.random.randint(1, max_m, size=shape)
        m2 = np.random.randint(1, max_m, size=shape)
        N = m1 + m2
        n = randint.rvs(0, N, size=N.shape)
        xl = np.maximum(0, n-m2)
        xu = np.minimum(n, m1)
        x = randint.rvs(xl, xu, size=xl.shape)
        w = np.random.rand(*x.shape)*2

        def support(N, m1, n, w):
            m2 = N - m1
            xl = np.maximum(0, n-m2)
            xu = np.minimum(n, m1)
            return xl, xu

        @np.vectorize
        def mean(N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

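            # The mean is approximated by the root u of the implicit equation
            # u/m1 + (1 - (n-u)/m2)**w = 1, a standard approximation for
            # Wallenius' distribution; hence the loose 2% tolerance below.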
            def fun(u):
                return u/m1 + (1 - (n-u)/m2)**w - 1

            return root_scalar(fun, bracket=(xl, xu)).root

        assert_allclose(nchypergeom_wallenius.mean(N, m1, n, w),
                        mean(N, m1, n, w), rtol=2e-2)

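        # Approximate variance N*a*b / ((N-1) * (m1*b + m2*a)), with
        # a = u*(m1-u), b = (n-u)*(u+m2-n) and u the approximate mean above;
        # it is only good to a few percent, hence rtol=5e-2.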
        @np.vectorize
        def variance(N, m1, n, w):
            m2 = N - m1
            u = mean(N, m1, n, w)
            a = u * (m1 - u)
            b = (n-u)*(u + m2 - n)
            return N*a*b / ((N-1) * (m1*b + m2*a))

        assert_allclose(nchypergeom_wallenius.stats(N, m1, n, w, moments='v'),
                        variance(N, m1, n, w), rtol=5e-2)

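        # pmf via the standard integral representation of Wallenius'
        # distribution:
        # pmf(x) = C(m1, x) * C(m2, n-x) *
        #          integral_0^1 (1 - t**(w/D))**x * (1 - t**(1/D))**(n-x) dt,
        # where D = w*(m1-x) + (m2-(n-x)).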
        @np.vectorize
        def pmf(x, N, m1, n, w):
            m2 = N - m1
            xl, xu = support(N, m1, n, w)

            def integrand(t):
                D = w*(m1 - x) + (m2 - (n-x))
                res = (1-t**(w/D))**x * (1-t**(1/D))**(n-x)
                return res

            def f(x):
                t1 = special_binom(m1, x)
                t2 = special_binom(m2, n - x)
                the_integral = quad(integrand, 0, 1,
                                    epsrel=1e-16, epsabs=1e-16)
                return t1 * t2 * the_integral[0]

            return f(x)

        pmf0 = pmf(x, N, m1, n, w)
        pmf1 = nchypergeom_wallenius.pmf(x, N, m1, n, w)

        atol, rtol = 1e-6, 1e-6
        i = np.abs(pmf1 - pmf0) < atol + rtol*np.abs(pmf0)
        assert i.sum() > np.prod(shape) / 2  # works at least half the time

        # for those that fail, discredit the naive implementation
        for N, m1, n, w in zip(N[~i], m1[~i], n[~i], w[~i]):
            # get the support
            m2 = N - m1
            xl, xu = support(N, m1, n, w)
            x = np.arange(xl, xu + 1)

            # calculate sum of pmf over the support
            # the naive implementation is very wrong in these cases
            assert pmf(x, N, m1, n, w).sum() < .5
            assert_allclose(nchypergeom_wallenius.pmf(x, N, m1, n, w).sum(), 1)

    def test_wallenius_against_mpmath(self):
        # precompute data with mpmath since the naive implementation above
        # is not reliable. See source code in gh-13330.
        M = 50
        n = 30
        N = 20
        odds = 2.25
        # Expected results, computed with mpmath.
        sup = np.arange(21)
        pmf = np.array([3.699003068656875e-20,
                        5.89398584245431e-17,
                        2.1594437742911123e-14,
                        3.221458044649955e-12,
                        2.4658279241205077e-10,
                        1.0965862603981212e-08,
                        3.057890479665704e-07,
                        5.622818831643761e-06,
                        7.056482841531681e-05,
                        0.000618899425358671,
                        0.003854172932571669,
                        0.01720592676256026,
                        0.05528844897093792,
                        0.12772363313574242,
                        0.21065898367825722,
                        0.24465958845359234,
                        0.1955114898110033,
                        0.10355390084949237,
                        0.03414490375225675,
                        0.006231989845775931,
                        0.0004715577304677075])
        mean = 14.808018384813426
        var = 2.6085975877923717

        # nchypergeom_wallenius.pmf returns 0 for pmf(0) and pmf(1), and
        # pmf(2) has only three digits of accuracy (~ 2.1511e-14).
        assert_allclose(nchypergeom_wallenius.pmf(sup, M, n, N, odds), pmf,
                        rtol=1e-13, atol=1e-13)
        assert_allclose(nchypergeom_wallenius.mean(M, n, N, odds),
                        mean, rtol=1e-13)
        assert_allclose(nchypergeom_wallenius.var(M, n, N, odds),
                        var, rtol=1e-11)

    @pytest.mark.parametrize('dist_name',
                             ['nchypergeom_fisher', 'nchypergeom_wallenius'])
    def test_rvs_shape(self, dist_name):
        # Check that when given a size with more dimensions than the
        # dimensions of the broadcast parameters, rvs returns an array
        # with the correct shape.
        dists = {'nchypergeom_fisher': nchypergeom_fisher,
                 'nchypergeom_wallenius': nchypergeom_wallenius}
        dist = dists[dist_name]
        x = dist.rvs(50, 30, [[10], [20]], [0.5, 1.0, 2.0], size=(5, 1, 2, 3))
        assert x.shape == (5, 1, 2, 3)


@pytest.mark.parametrize("mu, q, expected",
                         [[10, 120, -1.240089881791596e-38],
                          [1500, 0, -86.61466680572661]])
def test_nbinom_11465(mu, q, expected):
    # test nbinom.logcdf at extreme tails
    size = 20
    n, p = size, size/(size+mu)
    # Reference values from R, e.g. for the first parameter set:
    # options(digits=16)
    # pnbinom(mu=10, size=20, q=120, log.p=TRUE)
    assert_allclose(nbinom.logcdf(q, n, p), expected)