from __future__ import annotations
import math
import warnings
from collections import namedtuple

import numpy as np
from numpy import (isscalar, r_, log, around, unique, asarray, zeros,
                   arange, sort, amin, amax, atleast_1d, sqrt, array,
                   compress, pi, exp, ravel, count_nonzero, sin, cos,
                   arctan2, hypot)

from scipy import optimize
from scipy import special
from . import statlib
from . import stats
from .stats import find_repeats, _contains_nan, _normtest_finish
from .contingency import chi2_contingency
from . import distributions
from ._distn_infrastructure import rv_generic
from ._hypotests import _get_wilcoxon_distr


__all__ = ['mvsdist',
           'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
           'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
           'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
           'fligner', 'mood', 'wilcoxon', 'median_test',
           'circmean', 'circvar', 'circstd', 'anderson_ksamp',
           'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
           'yeojohnson_normplot'
           ]


Mean = namedtuple('Mean', ('statistic', 'minmax'))
Variance = namedtuple('Variance', ('statistic', 'minmax'))
Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))


def bayes_mvs(data, alpha=0.90):
    r"""
    Bayesian confidence intervals for the mean, var, and std.

    Parameters
    ----------
    data : array_like
        Input data, if multi-dimensional it is flattened to 1-D by `bayes_mvs`.
        Requires 2 or more data points.
    alpha : float, optional
        Probability that the returned confidence interval contains
        the true parameter.

    Returns
    -------
    mean_cntr, var_cntr, std_cntr : tuple
        The three results are for the mean, variance and standard deviation,
        respectively.  Each result is a tuple of the form::

            (center, (lower, upper))

        with `center` the mean of the conditional pdf of the value given the
        data, and `(lower, upper)` a confidence interval, centered on the
        median, containing the estimate to a probability ``alpha``.

    See Also
    --------
    mvsdist

    Notes
    -----
    Each tuple of mean, variance, and standard deviation estimates represents
    ``(center, (lower, upper))``, with `center` the mean of the conditional pdf
    of the value given the data and `(lower, upper)` a confidence interval
    centered on the median, containing the estimate to a probability
    ``alpha``.

    Converts data to 1-D and assumes all data has the same mean and variance.
    Uses Jeffreys prior for variance and std.

    Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    First a basic example to demonstrate the outputs:

    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.bayes_mvs(data)
    >>> mean
    Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
    >>> var
    Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
    >>> std
    Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))

    Now we generate some normally distributed random data, and get estimates of
    mean and standard deviation with 95% confidence intervals for those
    estimates:

    >>> n_samples = 100000
    >>> data = stats.norm.rvs(size=n_samples)
    >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
    >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
    >>> ax.axvspan(res_mean.minmax[0], res_mean.minmax[1], facecolor='r',
    ...            alpha=0.2, label=r'Estimated mean (95% limits)')
    >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
    >>> ax.axvspan(res_std.minmax[0], res_std.minmax[1], facecolor='g', alpha=0.2,
    ...            label=r'Estimated scale (95% limits)')

    >>> ax.legend(fontsize=10)
    >>> ax.set_xlim([-4, 4])
    >>> ax.set_ylim([0, 0.5])
    >>> plt.show()

    """
    m, v, s = mvsdist(data)
    if alpha >= 1 or alpha <= 0:
        raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
                         % alpha)

    m_res = Mean(m.mean(), m.interval(alpha))
    v_res = Variance(v.mean(), v.interval(alpha))
    s_res = Std_dev(s.mean(), s.interval(alpha))

    return m_res, v_res, s_res


def mvsdist(data):
    """
    'Frozen' distributions for mean, variance, and standard deviation of data.

    Parameters
    ----------
    data : array_like
        Input array. Converted to 1-D using ravel.
        Requires 2 or more data-points.

    Returns
    -------
    mdist : "frozen" distribution object
        Distribution object representing the mean of the data.
    vdist : "frozen" distribution object
        Distribution object representing the variance of the data.
    sdist : "frozen" distribution object
        Distribution object representing the standard deviation of the data.

    See Also
    --------
    bayes_mvs

    Notes
    -----
    The return values from ``bayes_mvs(data)`` are equivalent to
    ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.

    In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
    on the three distribution objects returned from this function will give
    the same results that are returned from `bayes_mvs`.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.mvsdist(data)

    We now have frozen distribution objects "mean", "var" and "std" that we can
    examine:

    >>> mean.mean()
    9.0
    >>> mean.interval(0.95)
    (6.6120585482655692, 11.387941451734431)
    >>> mean.std()
    1.1952286093343936

    """
    x = ravel(data)
    n = len(x)
    if n < 2:
        raise ValueError("Need at least 2 data-points.")
    xbar = x.mean()
    C = x.var()
    if n > 1000:  # gaussian approximations for large n
        mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
        sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
        vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
    else:
        nm1 = n - 1
        fac = n * C / 2.
        val = nm1 / 2.
        mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
        sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
        vdist = distributions.invgamma(val, scale=fac)
    return mdist, vdist, sdist


def kstat(data, n=2):
    r"""
    Return the nth k-statistic (1<=n<=4 so far).

    The nth k-statistic k_n is the unique symmetric unbiased estimator of the
    nth cumulant kappa_n.

    Parameters
    ----------
    data : array_like
        Input array. Note that n-D input gets flattened.
    n : int, {1, 2, 3, 4}, optional
        Default is equal to 2.

    Returns
    -------
    kstat : float
        The nth k-statistic.

    See Also
    --------
    kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    For a sample size n, the first few k-statistics are given by:

    .. math::

        k_{1} = \mu
        k_{2} = \frac{n}{n-1} m_{2}
        k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
        k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}

    where :math:`\mu` is the sample mean, :math:`m_2` is the sample
    variance, and :math:`m_i` is the i-th sample central moment.

    References
    ----------
    http://mathworld.wolfram.com/k-Statistic.html

    http://mathworld.wolfram.com/Cumulant.html

    Examples
    --------
    >>> from scipy import stats
    >>> from numpy.random import default_rng
    >>> rng = default_rng()

    As sample size increases, n-th moment and n-th k-statistic converge to the
    same number (although they aren't identical). In the case of the normal
    distribution, they converge to zero.

    >>> for n in [2, 3, 4, 5, 6, 7]:
    ...     x = rng.normal(size=10**n)
    ...     m, k = stats.moment(x, 3), stats.kstat(x, 3)
    ...     print("%.3g %.3g %.3g" % (m, k, m-k))
    -0.631 -0.651 0.0194  # random
    0.0282 0.0283 -8.49e-05
    -0.0454 -0.0454 1.36e-05
    7.53e-05 7.53e-05 -2.26e-09
    0.00166 0.00166 -4.99e-09
    -2.88e-06 -2.88e-06 8.63e-13
    """
    if n > 4 or n < 1:
        raise ValueError("k-statistics only supported for 1<=n<=4")
    n = int(n)
    S = np.zeros(n + 1, np.float64)
    data = ravel(data)
    N = data.size

    # raise ValueError on empty input
    if N == 0:
        raise ValueError("Data input must not be empty")

    # on nan input, return nan without warning
    if np.isnan(np.sum(data)):
        return np.nan

    for k in range(1, n + 1):
        S[k] = np.sum(data**k, axis=0)
    if n == 1:
        return S[1] * 1.0/N
    elif n == 2:
        return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
    elif n == 3:
        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
    elif n == 4:
        return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
                 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
                (N*(N-1.0)*(N-2.0)*(N-3.0)))
    else:
        raise ValueError("Should not be here.")


def kstatvar(data, n=2):
    r"""Return an unbiased estimator of the variance of the k-statistic.

    See `kstat` for more details of the k-statistic.

    Parameters
    ----------
    data : array_like
        Input array. Note that n-D input gets flattened.
    n : int, {1, 2}, optional
        Default is equal to 2.

    Returns
    -------
    kstatvar : float
        The nth k-statistic variance.

    See Also
    --------
    kstat: Returns the n-th k-statistic.
    moment: Returns the n-th central moment about the mean for a sample.

    Notes
    -----
    The variances of the first few k-statistics are given by:

    .. math::

        var(k_{1}) = \frac{\kappa^2}{n}
        var(k_{2}) = \frac{\kappa^4}{n} + \frac{2\kappa^2_{2}}{n - 1}
        var(k_{3}) = \frac{\kappa^6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
                     \frac{9 \kappa^2_{3}}{n - 1} +
                     \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
        var(k_{4}) = \frac{\kappa^8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
                     \frac{48 \kappa_{3} \kappa_5}{n - 1} +
                     \frac{34 \kappa^2_{4}}{n-1} +
                     \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
                     \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
                     \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}
    """
    data = ravel(data)
    N = len(data)
    if n == 1:
        return kstat(data, n=2) * 1.0/N
    elif n == 2:
        k2 = kstat(data, n=2)
        k4 = kstat(data, n=4)
        return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
    else:
        raise ValueError("Only n=1 or n=2 supported.")


def _calc_uniform_order_statistic_medians(n):
    """Approximations of uniform order statistic medians.

    Parameters
    ----------
    n : int
        Sample size.

    Returns
    -------
    v : 1d float array
        Approximations of the order statistic medians.

    References
    ----------
    .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
           Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.

    Examples
    --------
    Order statistics of the uniform distribution on the unit interval
    are marginally distributed according to beta distributions.
    The expectations of these order statistics are evenly spaced across
    the interval, but the distributions are skewed in a way that
    pushes the medians slightly towards the endpoints of the unit interval:

    >>> n = 4
    >>> k = np.arange(1, n+1)
    >>> from scipy.stats import beta
    >>> a = k
    >>> b = n-k+1
    >>> beta.mean(a, b)
    array([0.2, 0.4, 0.6, 0.8])
    >>> beta.median(a, b)
    array([0.15910358, 0.38572757, 0.61427243, 0.84089642])

    The Filliben approximation uses the exact medians of the smallest
    and greatest order statistics, and the remaining medians are approximated
    by points spread evenly across a sub-interval of the unit interval:

    >>> from scipy.stats.morestats import _calc_uniform_order_statistic_medians
    >>> _calc_uniform_order_statistic_medians(n)
    array([0.15910358, 0.38545246, 0.61454754, 0.84089642])

    This plot shows the skewed distributions of the order statistics
    of a sample of size four from a uniform distribution on the unit interval:

    >>> import matplotlib.pyplot as plt
    >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
    >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
    >>> plt.figure()
    >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])

    """
    v = np.empty(n, dtype=np.float64)
    v[-1] = 0.5**(1.0 / n)
    v[0] = 1 - v[-1]
    i = np.arange(2, n)
    v[1:-1] = (i - 0.3175) / (n + 0.365)
    return v


def _parse_dist_kw(dist, enforce_subclass=True):
    """Parse `dist` keyword.

    Parameters
    ----------
    dist : str or stats.distributions instance.
        Several functions take `dist` as a keyword, hence this utility
        function.
    enforce_subclass : bool, optional
        If True (default), `dist` needs to be a
        `_distn_infrastructure.rv_generic` instance.
        It can sometimes be useful to set this keyword to False, if a function
        wants to accept objects that just look somewhat like such an instance
        (for example, they have a ``ppf`` method).

    """
    if isinstance(dist, rv_generic):
        pass
    elif isinstance(dist, str):
        try:
            dist = getattr(distributions, dist)
        except AttributeError as e:
            raise ValueError("%s is not a valid distribution name" % dist) from e
    elif enforce_subclass:
        msg = ("`dist` should be a stats.distributions instance or a string "
               "with the name of such a distribution.")
        raise ValueError(msg)

    return dist


def _add_axis_labels_title(plot, xlabel, ylabel, title):
    """Helper function to add axes labels and a title to stats plots."""
    try:
        if hasattr(plot, 'set_title'):
            # Matplotlib Axes instance or something that looks like it
            plot.set_title(title)
            plot.set_xlabel(xlabel)
            plot.set_ylabel(ylabel)
        else:
            # matplotlib.pyplot module
            plot.title(title)
            plot.xlabel(xlabel)
            plot.ylabel(ylabel)
    except Exception:
        # Not an MPL object or something that looks (enough) like it.
        # Don't crash on adding labels or title
        pass


def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
    """
    Calculate quantiles for a probability plot, and optionally show the plot.

    Generates a probability plot of sample data against the quantiles of a
    specified theoretical distribution (the normal distribution by default).
    `probplot` optionally calculates a best-fit line for the data and plots the
    results using Matplotlib or a given plot function.

    Parameters
    ----------
    x : array_like
        Sample/response data from which `probplot` creates the plot.
    sparams : tuple, optional
        Distribution-specific shape parameters (shape parameters plus location
        and scale).
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name. The default is 'norm' for a
        normal probability plot.  Objects that look enough like a
        stats.distributions instance (i.e. they have a ``ppf`` method) are also
        accepted.
    fit : bool, optional
        Fit a least-squares regression (best-fit) line to the sample data if
        True (default).
    plot : object, optional
        If given, plots the quantiles.
        If given and `fit` is True, also plots the least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.

    Returns
    -------
    (osm, osr) : tuple of ndarrays
        Tuple of theoretical quantiles (osm, or order statistic medians) and
        ordered responses (osr).  `osr` is simply sorted input `x`.
        For details on how `osm` is calculated see the Notes section.
    (slope, intercept, r) : tuple of floats, optional
        Tuple containing the result of the least-squares fit, if that is
        performed by `probplot`.  `r` is the square root of the coefficient of
        determination.  If ``fit=False`` and ``plot=None``, this tuple is not
        returned.

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by `probplot`;
    ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
    calling `probplot`.

    `probplot` generates a probability plot, which should not be confused with
    a Q-Q or a P-P plot.  Statsmodels has more extensive functionality of this
    type, see ``statsmodels.api.ProbPlot``.

    The formula used for the theoretical quantiles (horizontal axis of the
    probability plot) is Filliben's estimate::

        quantiles = dist.ppf(val), for

                0.5**(1/n),                  for i = n
          val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
                1 - 0.5**(1/n),              for i = 1

    where ``i`` indicates the i-th ordered value and ``n`` is the total number
    of values.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> nsample = 100
    >>> rng = np.random.default_rng()

    A t distribution with small degrees of freedom:

    >>> ax1 = plt.subplot(221)
    >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
    >>> res = stats.probplot(x, plot=plt)

    A t distribution with larger degrees of freedom:

    >>> ax2 = plt.subplot(222)
    >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
    >>> res = stats.probplot(x, plot=plt)

    A mixture of two normal distributions with broadcasting:

    >>> ax3 = plt.subplot(223)
    >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
    ...                    size=(nsample//2,2), random_state=rng).ravel()
    >>> res = stats.probplot(x, plot=plt)

    A standard normal distribution:

    >>> ax4 = plt.subplot(224)
    >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
    >>> res = stats.probplot(x, plot=plt)

    Produce a new figure with a loggamma distribution, using the ``dist`` and
    ``sparams`` keywords:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
    >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
    >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")

    Show the results with Matplotlib:

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.size == 0:
        if fit:
            return (x, x), (np.nan, np.nan, 0.0)
        else:
            return x, x

    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
    dist = _parse_dist_kw(dist, enforce_subclass=False)
    if sparams is None:
        sparams = ()
    if isscalar(sparams):
        sparams = (sparams,)
    if not isinstance(sparams, tuple):
        sparams = tuple(sparams)

    osm = dist.ppf(osm_uniform, *sparams)
    osr = sort(x)
    if fit:
        # perform a linear least squares fit.
        slope, intercept, r, prob, _ = stats.linregress(osm, osr)

    if plot is not None:
        plot.plot(osm, osr, 'bo')
        if fit:
            plot.plot(osm, slope*osm + intercept, 'r-')
        _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
                               ylabel='Ordered Values',
                               title='Probability Plot')

        # Add R^2 value to the plot as text
        if rvalue:
            xmin = amin(osm)
            xmax = amax(osm)
            ymin = amin(x)
            ymax = amax(x)
            posx = xmin + 0.70 * (xmax - xmin)
            posy = ymin + 0.01 * (ymax - ymin)
            plot.text(posx, posy, "$R^2=%1.4f$" % r**2)

    if fit:
        return (osm, osr), (slope, intercept, r)
    else:
        return osm, osr


def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
    """Calculate the shape parameter that maximizes the PPCC.

    The probability plot correlation coefficient (PPCC) plot can be used
    to determine the optimal shape parameter for a one-parameter family
    of distributions.  ``ppcc_max`` returns the shape parameter that would
    maximize the probability plot correlation coefficient for the given
    data to a one-parameter family of distributions.

    Parameters
    ----------
    x : array_like
        Input array.
    brack : tuple, optional
        Triple (a,b,c) where (a<b<c).  If bracket consists of two numbers (a, c)
        then they are assumed to be a starting interval for a downhill bracket
        search (see `scipy.optimize.brent`).
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  Objects that look enough
        like a stats.distributions instance (i.e. they have a ``ppf`` method)
        are also accepted.  The default is ``'tukeylambda'``.

    Returns
    -------
    shape_value : float
        The shape parameter at which the probability plot correlation
        coefficient reaches its max value.

    See Also
    --------
    ppcc_plot, probplot, boxcox

    Notes
    -----
    The `brack` keyword serves as a starting point which is useful in corner
    cases. One can use a plot to obtain a rough visual estimate of the location
    for the maximum to start the search near it.

    References
    ----------
    .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
           for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
    .. [2] Engineering Statistics Handbook, NIST/SEMATECH,
           https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm

    Examples
    --------
    First we generate some random data from a Weibull distribution
    with shape parameter 2.5:

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> rng = np.random.default_rng()
    >>> c = 2.5
    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)

    Generate the PPCC plot for this data with the Weibull distribution.

    >>> fig, ax = plt.subplots(figsize=(8, 6))
    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)

    We calculate the value where the shape should reach its maximum and a
    red line is drawn there. The line should coincide with the highest
    point in the PPCC graph.

    >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
    >>> ax.axvline(cmax, color='r')
    >>> plt.show()

    """
    dist = _parse_dist_kw(dist)
    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
    osr = sort(x)

    # this function computes the x-axis values of the probability plot
    # and computes a linear regression (including the correlation)
    # and returns 1-r so that a minimization function maximizes the
    # correlation
    def tempfunc(shape, mi, yvals, func):
        xvals = func(mi, shape)
        r, prob = stats.pearsonr(xvals, yvals)
        return 1 - r

    return optimize.brent(tempfunc, brack=brack,
                          args=(osm_uniform, osr, dist.ppf))


def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
    """Calculate and optionally plot probability plot correlation coefficient.

    The probability plot correlation coefficient (PPCC) plot can be used to
    determine the optimal shape parameter for a one-parameter family of
    distributions.  It cannot be used for distributions without shape
    parameters (like the normal distribution) or with multiple shape
    parameters.

    By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
    Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
    distributions via an approximately normal one, and is therefore
    particularly useful in practice.

    Parameters
    ----------
    x : array_like
        Input array.
    a, b : scalar
        Lower and upper bounds of the shape parameter to use.
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  Objects that look enough
        like a stats.distributions instance (i.e. they have a ``ppf`` method)
        are also accepted.  The default is ``'tukeylambda'``.
    plot : object, optional
        If given, plots PPCC against the shape parameter.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `a` to `b`).

    Returns
    -------
    svals : ndarray
        The shape values for which `ppcc` was calculated.
    ppcc : ndarray
        The calculated probability plot correlation coefficient values.

    See Also
    --------
    ppcc_max, probplot, boxcox_normplot, tukeylambda

    References
    ----------
    J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
    Normality", Technometrics, Vol. 17, pp. 111-117, 1975.

    Examples
    --------
    First we generate some random data from a Weibull distribution
    with shape parameter 2.5, and plot the histogram of the data:

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> rng = np.random.default_rng()
    >>> c = 2.5
    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)

    Take a look at the histogram of the data.

    >>> fig1, ax = plt.subplots(figsize=(9, 4))
    >>> ax.hist(x, bins=50)
    >>> ax.set_title('Histogram of x')
    >>> plt.show()

    Now we explore this data with a PPCC plot as well as the related
    probability plot and Box-Cox normplot.  A red line is drawn where we
    expect the PPCC value to be maximal (at the shape parameter ``c``
    used above):

    >>> fig2 = plt.figure(figsize=(12, 4))
    >>> ax1 = fig2.add_subplot(1, 3, 1)
    >>> ax2 = fig2.add_subplot(1, 3, 2)
    >>> ax3 = fig2.add_subplot(1, 3, 3)
    >>> res = stats.probplot(x, plot=ax1)
    >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
    >>> ax3.axvline(c, color='r')
    >>> plt.show()

    """
    if b <= a:
        raise ValueError("`b` has to be larger than `a`.")

    svals = np.linspace(a, b, num=N)
    ppcc = np.empty_like(svals)
    for k, sval in enumerate(svals):
        _, r2 = probplot(x, sval, dist=dist, fit=True)
        ppcc[k] = r2[-1]

    if plot is not None:
        plot.plot(svals, ppcc, 'x')
        _add_axis_labels_title(plot, xlabel='Shape Values',
                               ylabel='Prob Plot Corr. Coef.',
                               title='(%s) PPCC Plot' % dist)

    return svals, ppcc


def boxcox_llf(lmb, data):
    r"""The boxcox log-likelihood function.

    Parameters
    ----------
    lmb : scalar
        Parameter for Box-Cox transformation.  See `boxcox` for details.
    data : array_like
        Data to calculate Box-Cox log-likelihood for.  If `data` is
        multi-dimensional, the log-likelihood is calculated along the first
        axis.

    Returns
    -------
    llf : float or ndarray
        Box-Cox log-likelihood of `data` given `lmb`.  A float for 1-D `data`,
        an array otherwise.

    See Also
    --------
    boxcox, probplot, boxcox_normplot, boxcox_normmax

    Notes
    -----
    The Box-Cox log-likelihood function is defined here as

    .. math::

        llf = (\lambda - 1) \sum_i(\log(x_i)) -
              N/2 \log(\sum_i (y_i - \bar{y})^2 / N),

    where ``y`` is the Box-Cox transformed input data ``x``.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    Generate some random variates and calculate Box-Cox log-likelihood values
    for them for a range of ``lmbda`` values:

    >>> rng = np.random.default_rng()
    >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
    >>> lmbdas = np.linspace(-2, 10)
    >>> llf = np.zeros(lmbdas.shape, dtype=float)
    >>> for ii, lmbda in enumerate(lmbdas):
    ...     llf[ii] = stats.boxcox_llf(lmbda, x)

    Also find the optimal lmbda value with `boxcox`:

    >>> x_most_normal, lmbda_optimal = stats.boxcox(x)

    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
    horizontal line to check that that's really the optimum:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(lmbdas, llf, 'b.-')
    >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
    >>> ax.set_xlabel('lmbda parameter')
    >>> ax.set_ylabel('Box-Cox log-likelihood')

    Now add some probability plots to show that where the log-likelihood is
    maximized the data transformed with `boxcox` looks closest to normal:

    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    ...     xt = stats.boxcox(x, lmbda=lmbda)
    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
    ...     ax_inset.set_xticklabels([])
    ...     ax_inset.set_yticklabels([])
    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

    >>> plt.show()

    """
    data = np.asarray(data)
    N = data.shape[0]
    if N == 0:
        return np.nan

    logdata = np.log(data)

    # Compute the variance of the transformed data.
    if lmb == 0:
        variance = np.var(logdata, axis=0)
    else:
        # Transform without the constant offset 1/lmb.  The offset does
        # not affect the variance, and the subtraction of the offset can
        # lead to loss of precision.
        variance = np.var(data**lmb / lmb, axis=0)

    return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


def _boxcox_conf_interval(x, lmax, alpha):
    # Need to find the lambda for which
    #  f(x, lmbda) >= f(x, lmax) - 0.5*chi^2_alpha;1
    fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
    target = boxcox_llf(lmax, x) - fac

    def rootfunc(lmbda, data, target):
        return boxcox_llf(lmbda, data) - target

    # Find positive endpoint of interval in which answer is to be found
    newlm = lmax + 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm += 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))

    # Now find negative interval in the same way
    newlm = lmax - 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm -= 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
    return lmminus, lmplus


def boxcox(x, lmbda=None, alpha=None, optimizer=None):
    r"""Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive 1-dimensional.  Must not be constant.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.
        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.
    optimizer : callable, optional
        If `lmbda` is None, `optimizer` is the scalar optimizer used to find
        the value of `lmbda` that minimizes the negative log-likelihood
        function.
        `optimizer` is a callable that accepts one argument:

        fun : callable
            The objective function, which evaluates the negative
            log-likelihood function at a provided value of `lmbda`

        and returns an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal value of
        `lmbda` in an attribute `x`.

        See the example in `boxcox_normmax` or the documentation of
        `scipy.optimize.minimize_scalar` for more information.

        If `lmbda` is not None, `optimizer` is ignored.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant to
    `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the interval
    where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
    function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `boxcox` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, _ = stats.boxcox(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Box-Cox transformation')

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Data must be 1-dimensional.")

    if x.size == 0:
        return x

    if np.all(x == x[0]):
        raise ValueError("Data must not be constant.")

    if np.any(x <= 0):
        raise ValueError("Data must be positive.")

    if lmbda is not None:  # single transformation
        return special.boxcox(x, lmbda)

    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
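    # The MLE is obtained by numerically minimizing -boxcox_llf via
    # `boxcox_normmax(..., method='mle')`; when `alpha` is given, a
    # profile-likelihood confidence interval for lmbda is computed below
    # with `_boxcox_conf_interval`.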
    lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
    y = boxcox(x, lmax)

    if alpha is None:
        return y, lmax
    else:
        # Find confidence interval
        interval = _boxcox_conf_interval(x, lmax, alpha)
        return y, lmax, interval


def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None):
    """Compute optimal Box-Cox transform parameter for input data.

    Parameters
    ----------
    x : array_like
        Input array.
    brack : 2-tuple, optional, default (-2.0, 2.0)
        The starting interval for a downhill bracket search for the default
        `optimize.brent` solver.  Note that this is in most cases not
        critical; the final result is allowed to be outside this bracket.
        If `optimizer` is passed, `brack` must be None.
    method : str, optional
        The method to determine the optimal transform parameter (`boxcox`
        ``lmbda`` parameter).  Options are:

        'pearsonr'  (default)
            Maximizes the Pearson correlation coefficient between
            ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
            normally-distributed.

        'mle'
            Minimizes the log-likelihood `boxcox_llf`.  This is the method used
            in `boxcox`.

        'all'
            Use all optimization methods available, and return all results.
            Useful to compare different methods.
    optimizer : callable, optional
        `optimizer` is a callable that accepts one argument:

        fun : callable
            The objective function to be optimized.  `fun` accepts one argument,
            the Box-Cox transform parameter `lmbda`, and returns the negative
            log-likelihood function at the provided value.  The job of
            `optimizer` is to find the value of `lmbda` that minimizes `fun`.

        and returns an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal value of
        `lmbda` in an attribute `x`.

        See the example below or the documentation of
        `scipy.optimize.minimize_scalar` for more information.

    Returns
    -------
    maxlog : float or ndarray
        The optimal transform parameter found.  An array instead of a scalar
        for ``method='all'``.

    See Also
    --------
    boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We can generate some data and determine the optimal ``lmbda`` in various
    ways:

    >>> rng = np.random.default_rng()
    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
    >>> y, lmax_mle = stats.boxcox(x)
    >>> lmax_pearsonr = stats.boxcox_normmax(x)

    >>> lmax_mle
    1.4613865614008015
    >>> lmax_pearsonr
    1.6685004886804342
    >>> stats.boxcox_normmax(x, method='all')
    array([1.66850049, 1.46138656])

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
    >>> ax.axvline(lmax_mle, color='r')
    >>> ax.axvline(lmax_pearsonr, color='g', ls='--')

    >>> plt.show()

    Alternatively, we can define our own `optimizer` function.  Suppose we
    are only interested in values of `lmbda` on the interval [6, 7], we
    want to use `scipy.optimize.minimize_scalar` with ``method='bounded'``,
    and we want to use tighter tolerances when optimizing the log-likelihood
    function.
    To do this, we define a function that accepts positional argument
    `fun` and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject
    to the provided bounds and tolerances:

    >>> from scipy import optimize
    >>> options = {'xatol': 1e-12}  # absolute tolerance on `x`
    >>> def optimizer(fun):
    ...     return optimize.minimize_scalar(fun, bounds=(6, 7),
    ...                                     method="bounded", options=options)
    >>> stats.boxcox_normmax(x, optimizer=optimizer)
    6.000...
    """
    # If optimizer is not given, define default 'brent' optimizer.
    if optimizer is None:

        # Set default value for `brack`.
        if brack is None:
            brack = (-2.0, 2.0)

        def _optimizer(func, args):
            return optimize.brent(func, args=args, brack=brack)

    # Otherwise check optimizer.
    else:
        if not callable(optimizer):
            raise ValueError("`optimizer` must be a callable")

        if brack is not None:
            raise ValueError("`brack` must be None if `optimizer` is given")

        # `optimizer` is expected to return an `OptimizeResult` object; here we
        # get the solution to the optimization problem.
        def _optimizer(func, args):
            def func_wrapped(x):
                return func(x, *args)
            return getattr(optimizer(func_wrapped), 'x', None)

    def _pearsonr(x):
        osm_uniform = _calc_uniform_order_statistic_medians(len(x))
        xvals = distributions.norm.ppf(osm_uniform)

        def _eval_pearsonr(lmbda, xvals, samps):
            # This function computes the x-axis values of the probability plot
            # and computes a linear regression (including the correlation) and
            # returns ``1 - r`` so that a minimization function maximizes the
            # correlation.
            y = boxcox(samps, lmbda)
            yvals = np.sort(y)
            r, prob = stats.pearsonr(xvals, yvals)
            return 1 - r

        return _optimizer(_eval_pearsonr, args=(xvals, x))

    def _mle(x):
        def _eval_mle(lmb, data):
            # function to minimize
            return -boxcox_llf(lmb, data)

        return _optimizer(_eval_mle, args=(x,))

    def _all(x):
        maxlog = np.empty(2, dtype=float)
        maxlog[0] = _pearsonr(x)
        maxlog[1] = _mle(x)
        return maxlog

    methods = {'pearsonr': _pearsonr,
               'mle': _mle,
               'all': _all}
    if method not in methods.keys():
        raise ValueError("Method %s not recognized." % method)

    optimfunc = methods[method]
    res = optimfunc(x)
    if res is None:
        message = ("`optimizer` must return an object containing the optimal "
                   "`lmbda` in attribute `x`")
        raise ValueError(message)
    return res


def _normplot(method, x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
    optionally show it.

    See `boxcox_normplot` or `yeojohnson_normplot` for details.
    """

    if method == 'boxcox':
        title = 'Box-Cox Normality Plot'
        transform_func = boxcox
    else:
        title = 'Yeo-Johnson Normality Plot'
        transform_func = yeojohnson

    x = np.asarray(x)
    if x.size == 0:
        return x

    if lb <= la:
        raise ValueError("`lb` has to be larger than `la`.")

    lmbdas = np.linspace(la, lb, num=N)
    ppcc = lmbdas * 0.0
    for i, val in enumerate(lmbdas):
        # Determine for each lmbda the correlation coefficient of the
        # transformed x
        z = transform_func(x, lmbda=val)
        _, (_, _, r) = probplot(z, dist='norm', fit=True)
        ppcc[i] = r

    if plot is not None:
        plot.plot(lmbdas, ppcc, 'x')
        _add_axis_labels_title(plot, xlabel='$\\lambda$',
                               ylabel='Prob Plot Corr. Coef.',
                               title=title)

    return lmbdas, ppcc


def boxcox_normplot(x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox normality plot, optionally show it.

    A Box-Cox normality plot shows graphically what the best transformation
    parameter is to use in `boxcox` to obtain a distribution that is close
    to normal.

    Parameters
    ----------
    x : array_like
        Input array.
    la, lb : scalar
        The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
        for Box-Cox transformations.  These are also the limits of the
        horizontal axis of the plot if that is generated.
    plot : object, optional
        If given, plots the quantiles and least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `la` to `lb`).

    Returns
    -------
    lmbdas : ndarray
        The ``lmbda`` values for which a Box-Cox transform was done.
    ppcc : ndarray
        Probability Plot Correlation Coefficient, as obtained from `probplot`
        when fitting the Box-Cox transformed input `x` against a normal
        distribution.

    See Also
    --------
    probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by
    `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
    should be used after calling `boxcox_normplot`.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    Generate some non-normally distributed data, and create a Box-Cox plot:

    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)

    Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
    the same plot:

    >>> _, maxlog = stats.boxcox(x)
    >>> ax.axvline(maxlog, color='r')

    >>> plt.show()

    """
    return _normplot('boxcox', x, la, lb, plot, N)


def yeojohnson(x, lmbda=None):
    r"""Return a dataset transformed by a Yeo-Johnson power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Should be 1-dimensional.
    lmbda : float, optional
        If ``lmbda`` is ``None``, find the lambda that maximizes the
        log-likelihood function and return it as the second output argument.
        Otherwise the transformation is done for the given value.

    Returns
    -------
    yeojohnson: ndarray
        Yeo-Johnson power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.

    See Also
    --------
    probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox

    Notes
    -----
    The Yeo-Johnson transform is given by::

        y = ((x + 1)**lmbda - 1) / lmbda,                for x >= 0, lmbda != 0
            log(x + 1),                                  for x >= 0, lmbda = 0
            -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda),  for x < 0, lmbda != 2
            -log(-x + 1),                                for x < 0, lmbda = 2

    Unlike `boxcox`, `yeojohnson` does not require the input data to be
    positive.

    .. versionadded:: 1.2.0


    References
    ----------
    I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
    Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.


    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `yeojohnson` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, lmbda = stats.yeojohnson(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Yeo-Johnson transformation')

    >>> plt.show()

    """
    x = np.asarray(x)
    if x.size == 0:
        return x

    if np.issubdtype(x.dtype, np.complexfloating):
        raise ValueError('Yeo-Johnson transformation is not defined for '
                         'complex numbers.')

    if np.issubdtype(x.dtype, np.integer):
        x = x.astype(np.float64, copy=False)

    if lmbda is not None:
        return _yeojohnson_transform(x, lmbda)

    # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
    lmax = yeojohnson_normmax(x)
    y = _yeojohnson_transform(x, lmax)

    return y, lmax


def _yeojohnson_transform(x, lmbda):
    """Returns `x` transformed by the Yeo-Johnson power transform with given
    parameter `lmbda`.
    """
    out = np.zeros_like(x)
    pos = x >= 0  # binary mask

    # when x >= 0
    if abs(lmbda) < np.spacing(1.):
        out[pos] = np.log1p(x[pos])
    else:  # lmbda != 0
        out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda

    # when x < 0
    if abs(lmbda - 2) > np.spacing(1.):
        out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
    else:  # lmbda == 2
        out[~pos] = -np.log1p(-x[~pos])

    return out


def yeojohnson_llf(lmb, data):
    r"""The yeojohnson log-likelihood function.

    Parameters
    ----------
    lmb : scalar
        Parameter for Yeo-Johnson transformation.  See `yeojohnson` for
        details.
    data : array_like
        Data to calculate Yeo-Johnson log-likelihood for.
        If `data` is multi-dimensional, the log-likelihood is calculated
        along the first axis.

    Returns
    -------
    llf : float
        Yeo-Johnson log-likelihood of `data` given `lmb`.

    See Also
    --------
    yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax

    Notes
    -----
    The Yeo-Johnson log-likelihood function is defined here as

    .. math::

        llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
              \sum_i \text{ sign }(x_i)\log(|x_i| + 1)

    where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
    transformed input data ``x``.

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    Generate some random variates and calculate Yeo-Johnson log-likelihood
    values for them for a range of ``lmbda`` values:

    >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
    >>> lmbdas = np.linspace(-2, 10)
    >>> llf = np.zeros(lmbdas.shape, dtype=float)
    >>> for ii, lmbda in enumerate(lmbdas):
    ...     llf[ii] = stats.yeojohnson_llf(lmbda, x)

    Also find the optimal lmbda value with `yeojohnson`:

    >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)

    Plot the log-likelihood as function of lmbda.  Add the optimal lmbda as a
    horizontal line to check that that's really the optimum:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(lmbdas, llf, 'b.-')
    >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
    >>> ax.set_xlabel('lmbda parameter')
    >>> ax.set_ylabel('Yeo-Johnson log-likelihood')

    Now add some probability plots to show that where the log-likelihood is
    maximized the data transformed with `yeojohnson` looks closest to normal:

    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    ...     xt = stats.yeojohnson(x, lmbda=lmbda)
    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
    ...     ax_inset.set_xticklabels([])
    ...     ax_inset.set_yticklabels([])
    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

    >>> plt.show()

    """
    data = np.asarray(data)
    n_samples = data.shape[0]

    if n_samples == 0:
        return np.nan

    trans = _yeojohnson_transform(data, lmb)

    loglike = -n_samples / 2 * np.log(trans.var(axis=0))
    loglike += (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0)

    return loglike


def yeojohnson_normmax(x, brack=(-2, 2)):
    """Compute optimal Yeo-Johnson transform parameter.

    Compute optimal Yeo-Johnson transform parameter for input data, using
    maximum likelihood estimation.

    Parameters
    ----------
    x : array_like
        Input array.
    brack : 2-tuple, optional
        The starting interval for a downhill bracket search with
        `optimize.brent`.  Note that this is in most cases not critical; the
        final result is allowed to be outside this bracket.

    Returns
    -------
    maxlog : float
        The optimal transform parameter found.

    See Also
    --------
    yeojohnson, yeojohnson_llf, yeojohnson_normplot

    Notes
    -----
    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    Generate some data and determine the optimal ``lmbda``:

    >>> rng = np.random.default_rng()
    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
    >>> lmax = stats.yeojohnson_normmax(x)

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
    >>> ax.axvline(lmax, color='r')

    >>> plt.show()

    """
    def _neg_llf(lmbda, data):
        return -yeojohnson_llf(lmbda, data)

    return optimize.brent(_neg_llf, brack=brack, args=(x,))


def yeojohnson_normplot(x, la, lb, plot=None, N=80):
    """Compute parameters for a Yeo-Johnson normality plot, optionally show it.

    A Yeo-Johnson normality plot shows graphically what the best
    transformation parameter is to use in `yeojohnson` to obtain a
    distribution that is close to normal.

    Parameters
    ----------
    x : array_like
        Input array.
    la, lb : scalar
        The lower and upper bounds for the ``lmbda`` values to pass to
        `yeojohnson` for Yeo-Johnson transformations.  These are also the
        limits of the horizontal axis of the plot if that is generated.
    plot : object, optional
        If given, plots the quantiles and least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `la` to `lb`).

    Returns
    -------
    lmbdas : ndarray
        The ``lmbda`` values for which a Yeo-Johnson transform was done.
    ppcc : ndarray
        Probability Plot Correlation Coefficient, as obtained from `probplot`
        when fitting the Yeo-Johnson transformed input `x` against a normal
        distribution.

    See Also
    --------
    probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by
    `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
    should be used after calling `yeojohnson_normplot`.

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    Generate some non-normally distributed data, and create a Yeo-Johnson plot:

    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)

    Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
    the same plot:

    >>> _, maxlog = stats.yeojohnson(x)
    >>> ax.axvline(maxlog, color='r')

    >>> plt.show()

    """
    return _normplot('yeojohnson', x, la, lb, plot, N)


ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))


def shapiro(x):
    """Perform the Shapiro-Wilk test for normality.

    The Shapiro-Wilk test tests the null hypothesis that the
    data was drawn from a normal distribution.

    Parameters
    ----------
    x : array_like
        Array of sample data.

    Returns
    -------
    statistic : float
        The test statistic.
    p-value : float
        The p-value for the hypothesis test.

    See Also
    --------
    anderson : The Anderson-Darling test for normality
    kstest : The Kolmogorov-Smirnov test for goodness of fit.

    Notes
    -----
    The algorithm used is described in [4]_ but censoring parameters as
    described are not implemented. For N > 5000 the W test statistic is
    accurate but the p-value may not be.

    The chance of rejecting the null hypothesis when it is true is close to 5%
    regardless of sample size.

    References
    ----------
    .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
    .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
           normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
    .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
           Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
           Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
    .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.

    Examples
    --------
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
    >>> shapiro_test = stats.shapiro(x)
    >>> shapiro_test
    ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
    >>> shapiro_test.statistic
    0.9813305735588074
    >>> shapiro_test.pvalue
    0.16855233907699585

    """
    x = np.ravel(x)

    N = len(x)
    if N < 3:
        raise ValueError("Data must be at least length 3.")

    a = zeros(N, 'f')
    init = 0

    y = sort(x)
    a, w, pw, ifault = statlib.swilk(y, a[:N//2], init)
    if ifault not in [0, 2]:
        warnings.warn("Input data for shapiro has range zero. The results "
                      "may not be accurate.")
    if N > 5000:
        warnings.warn("p-value may not be accurate for N > 5000.")

    return ShapiroResult(w, pw)


# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
#             Some Comparisons", Journal of the American Statistical
#             Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
#             Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
#             on the Empirical Distribution Function.", Biometrika,
#             Vol. 66, Issue 3, Dec. 1979, pp 591-595.
_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])


AndersonResult = namedtuple('AndersonResult', ('statistic',
                                               'critical_values',
                                               'significance_level'))


def anderson(x, dist='norm'):
    """Anderson-Darling test for data coming from a particular distribution.

    The Anderson-Darling test tests the null hypothesis that a sample is
    drawn from a population that follows a particular distribution.
    For the Anderson-Darling test, the critical values depend on
    which distribution is being tested against.
This function works 1791 for normal, exponential, logistic, or Gumbel (Extreme Value 1792 Type I) distributions. 1793 1794 Parameters 1795 ---------- 1796 x : array_like 1797 Array of sample data. 1798 dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1'}, optional 1799 The type of distribution to test against. The default is 'norm'. 1800 The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the 1801 same distribution. 1802 1803 Returns 1804 ------- 1805 statistic : float 1806 The Anderson-Darling test statistic. 1807 critical_values : list 1808 The critical values for this distribution. 1809 significance_level : list 1810 The significance levels for the corresponding critical values 1811 in percents. The function returns critical values for a 1812 differing set of significance levels depending on the 1813 distribution that is being tested against. 1814 1815 See Also 1816 -------- 1817 kstest : The Kolmogorov-Smirnov test for goodness-of-fit. 1818 1819 Notes 1820 ----- 1821 Critical values provided are for the following significance levels: 1822 1823 normal/exponential 1824 15%, 10%, 5%, 2.5%, 1% 1825 logistic 1826 25%, 10%, 5%, 2.5%, 1%, 0.5% 1827 Gumbel 1828 25%, 10%, 5%, 2.5%, 1% 1829 1830 If the returned statistic is larger than these critical values then 1831 for the corresponding significance level, the null hypothesis that 1832 the data come from the chosen distribution can be rejected. 1833 The returned statistic is referred to as 'A2' in the references. 1834 1835 References 1836 ---------- 1837 .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm 1838 .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and 1839 Some Comparisons, Journal of the American Statistical Association, 1840 Vol. 69, pp. 730-737. 1841 .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit 1842 Statistics with Unknown Parameters, Annals of Statistics, Vol. 4, 1843 pp. 357-369. 1844 .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value 1845 Distribution, Biometrika, Vol. 64, pp. 583-588. 1846 .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference 1847 to Tests for Exponentiality , Technical Report No. 262, 1848 Department of Statistics, Stanford University, Stanford, CA. 1849 .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution 1850 Based on the Empirical Distribution Function, Biometrika, Vol. 66, 1851 pp. 591-595. 
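
    Examples
    --------
    The following is a minimal usage sketch; the normal sample drawn here
    and the name ``reject_at_5pct`` are illustrative only:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = rng.normal(loc=1.5, scale=2.0, size=100)
    >>> res = stats.anderson(x, dist='norm')
    >>> # For 'norm', critical_values[2] is the 5% critical value, so the
    >>> # null hypothesis is rejected at the 5% level if the statistic
    >>> # exceeds it.
    >>> reject_at_5pct = res.statistic > res.critical_values[2]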
1852 1853 """ 1854 if dist not in ['norm', 'expon', 'gumbel', 'gumbel_l', 1855 'gumbel_r', 'extreme1', 'logistic']: 1856 raise ValueError("Invalid distribution; dist must be 'norm', " 1857 "'expon', 'gumbel', 'extreme1' or 'logistic'.") 1858 y = sort(x) 1859 xbar = np.mean(x, axis=0) 1860 N = len(y) 1861 if dist == 'norm': 1862 s = np.std(x, ddof=1, axis=0) 1863 w = (y - xbar) / s 1864 logcdf = distributions.norm.logcdf(w) 1865 logsf = distributions.norm.logsf(w) 1866 sig = array([15, 10, 5, 2.5, 1]) 1867 critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3) 1868 elif dist == 'expon': 1869 w = y / xbar 1870 logcdf = distributions.expon.logcdf(w) 1871 logsf = distributions.expon.logsf(w) 1872 sig = array([15, 10, 5, 2.5, 1]) 1873 critical = around(_Avals_expon / (1.0 + 0.6/N), 3) 1874 elif dist == 'logistic': 1875 def rootfunc(ab, xj, N): 1876 a, b = ab 1877 tmp = (xj - a) / b 1878 tmp2 = exp(tmp) 1879 val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N, 1880 np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N] 1881 return array(val) 1882 1883 sol0 = array([xbar, np.std(x, ddof=1, axis=0)]) 1884 sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5) 1885 w = (y - sol[0]) / sol[1] 1886 logcdf = distributions.logistic.logcdf(w) 1887 logsf = distributions.logistic.logsf(w) 1888 sig = array([25, 10, 5, 2.5, 1, 0.5]) 1889 critical = around(_Avals_logistic / (1.0 + 0.25/N), 3) 1890 elif dist == 'gumbel_r': 1891 xbar, s = distributions.gumbel_r.fit(x) 1892 w = (y - xbar) / s 1893 logcdf = distributions.gumbel_r.logcdf(w) 1894 logsf = distributions.gumbel_r.logsf(w) 1895 sig = array([25, 10, 5, 2.5, 1]) 1896 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 1897 else: # (dist == 'gumbel') or (dist == 'gumbel_l') or (dist == 'extreme1') 1898 xbar, s = distributions.gumbel_l.fit(x) 1899 w = (y - xbar) / s 1900 logcdf = distributions.gumbel_l.logcdf(w) 1901 logsf = distributions.gumbel_l.logsf(w) 1902 sig = array([25, 10, 5, 2.5, 1]) 1903 critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3) 1904 1905 i = arange(1, N + 1) 1906 A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0) 1907 1908 return AndersonResult(A2, critical, sig) 1909 1910 1911def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N): 1912 """Compute A2akN equation 7 of Scholz and Stephens. 1913 1914 Parameters 1915 ---------- 1916 samples : sequence of 1-D array_like 1917 Array of sample arrays. 1918 Z : array_like 1919 Sorted array of all observations. 1920 Zstar : array_like 1921 Sorted array of unique observations. 1922 k : int 1923 Number of samples. 1924 n : array_like 1925 Number of observations in each sample. 1926 N : int 1927 Total number of observations. 1928 1929 Returns 1930 ------- 1931 A2aKN : float 1932 The A2aKN statistics of Scholz and Stephens 1987. 1933 1934 """ 1935 A2akN = 0. 1936 Z_ssorted_left = Z.searchsorted(Zstar, 'left') 1937 if N == Zstar.size: 1938 lj = 1. 1939 else: 1940 lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left 1941 Bj = Z_ssorted_left + lj / 2. 1942 for i in arange(0, k): 1943 s = np.sort(samples[i]) 1944 s_ssorted_right = s.searchsorted(Zstar, side='right') 1945 Mij = s_ssorted_right.astype(float) 1946 fij = s_ssorted_right - s.searchsorted(Zstar, 'left') 1947 Mij -= fij / 2. 1948 inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.) 1949 A2akN += inner.sum() / n[i] 1950 A2akN *= (N - 1.) / N 1951 return A2akN 1952 1953 1954def _anderson_ksamp_right(samples, Z, Zstar, k, n, N): 1955 """Compute A2akN equation 6 of Scholz & Stephens. 
1956 1957 Parameters 1958 ---------- 1959 samples : sequence of 1-D array_like 1960 Array of sample arrays. 1961 Z : array_like 1962 Sorted array of all observations. 1963 Zstar : array_like 1964 Sorted array of unique observations. 1965 k : int 1966 Number of samples. 1967 n : array_like 1968 Number of observations in each sample. 1969 N : int 1970 Total number of observations. 1971 1972 Returns 1973 ------- 1974 A2KN : float 1975 The A2KN statistics of Scholz and Stephens 1987. 1976 1977 """ 1978 A2kN = 0. 1979 lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1], 1980 'left') 1981 Bj = lj.cumsum() 1982 for i in arange(0, k): 1983 s = np.sort(samples[i]) 1984 Mij = s.searchsorted(Zstar[:-1], side='right') 1985 inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj)) 1986 A2kN += inner.sum() / n[i] 1987 return A2kN 1988 1989 1990Anderson_ksampResult = namedtuple('Anderson_ksampResult', 1991 ('statistic', 'critical_values', 1992 'significance_level')) 1993 1994 1995def anderson_ksamp(samples, midrank=True): 1996 """The Anderson-Darling test for k-samples. 1997 1998 The k-sample Anderson-Darling test is a modification of the 1999 one-sample Anderson-Darling test. It tests the null hypothesis 2000 that k-samples are drawn from the same population without having 2001 to specify the distribution function of that population. The 2002 critical values depend on the number of samples. 2003 2004 Parameters 2005 ---------- 2006 samples : sequence of 1-D array_like 2007 Array of sample data in arrays. 2008 midrank : bool, optional 2009 Type of Anderson-Darling test which is computed. Default 2010 (True) is the midrank test applicable to continuous and 2011 discrete populations. If False, the right side empirical 2012 distribution is used. 2013 2014 Returns 2015 ------- 2016 statistic : float 2017 Normalized k-sample Anderson-Darling test statistic. 2018 critical_values : array 2019 The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%, 2020 0.5%, 0.1%. 2021 significance_level : float 2022 An approximate significance level at which the null hypothesis for the 2023 provided samples can be rejected. The value is floored / capped at 2024 0.1% / 25%. 2025 2026 Raises 2027 ------ 2028 ValueError 2029 If less than 2 samples are provided, a sample is empty, or no 2030 distinct observations are in the samples. 2031 2032 See Also 2033 -------- 2034 ks_2samp : 2 sample Kolmogorov-Smirnov test 2035 anderson : 1 sample Anderson-Darling test 2036 2037 Notes 2038 ----- 2039 [1]_ defines three versions of the k-sample Anderson-Darling test: 2040 one for continuous distributions and two for discrete 2041 distributions, in which ties between samples may occur. The 2042 default of this routine is to compute the version based on the 2043 midrank empirical distribution function. This test is applicable 2044 to continuous and discrete data. If midrank is set to False, the 2045 right side empirical distribution is used for a test for discrete 2046 data. According to [1]_, the two discrete test statistics differ 2047 only slightly if a few collisions due to round-off errors occur in 2048 the test not adjusted for ties between samples. 2049 2050 The critical values corresponding to the significance levels from 0.01 2051 to 0.25 are taken from [1]_. p-values are floored / capped 2052 at 0.1% / 25%. Since the range of critical values might be extended in 2053 future releases, it is recommended not to test ``p == 0.25``, but rather 2054 ``p >= 0.25`` (analogously for the lower bound). 
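
    As a sketch of that recommendation (the two normal samples below are
    arbitrary and only illustrate the call), a check against the capped
    upper bound can be written as:

    >>> import numpy as np
    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> res = stats.anderson_ksamp([rng.normal(size=50),
    ...                             rng.normal(size=30)])
    >>> at_upper_cap = res.significance_level >= 0.25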
2055 2056 .. versionadded:: 0.14.0 2057 2058 References 2059 ---------- 2060 .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample 2061 Anderson-Darling Tests, Journal of the American Statistical 2062 Association, Vol. 82, pp. 918-924. 2063 2064 Examples 2065 -------- 2066 >>> from scipy import stats 2067 >>> rng = np.random.default_rng() 2068 2069 The null hypothesis that the two random samples come from the same 2070 distribution can be rejected at the 5% level because the returned 2071 test value is greater than the critical value for 5% (1.961) but 2072 not at the 2.5% level. The interpolation gives an approximate 2073 significance level of 3.2%: 2074 2075 >>> stats.anderson_ksamp([rng.normal(size=50), 2076 ... rng.normal(loc=0.5, size=30)]) 2077 (1.974403288713695, 2078 array([0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]), 2079 0.04991293614572478) 2080 2081 2082 The null hypothesis cannot be rejected for three samples from an 2083 identical distribution. The reported p-value (25%) has been capped and 2084 may not be very accurate (since it corresponds to the value 0.449 2085 whereas the statistic is -0.731): 2086 2087 >>> stats.anderson_ksamp([rng.normal(size=50), 2088 ... rng.normal(size=30), rng.normal(size=20)]) 2089 (-0.29103725200789504, 2090 array([ 0.44925884, 1.3052767 , 1.9434184 , 2.57696569, 3.41634856, 2091 4.07210043, 5.56419101]), 2092 0.25) 2093 2094 """ 2095 k = len(samples) 2096 if (k < 2): 2097 raise ValueError("anderson_ksamp needs at least two samples") 2098 2099 samples = list(map(np.asarray, samples)) 2100 Z = np.sort(np.hstack(samples)) 2101 N = Z.size 2102 Zstar = np.unique(Z) 2103 if Zstar.size < 2: 2104 raise ValueError("anderson_ksamp needs more than one distinct " 2105 "observation") 2106 2107 n = np.array([sample.size for sample in samples]) 2108 if np.any(n == 0): 2109 raise ValueError("anderson_ksamp encountered sample without " 2110 "observations") 2111 2112 if midrank: 2113 A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N) 2114 else: 2115 A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N) 2116 2117 H = (1. / n).sum() 2118 hs_cs = (1. / arange(N - 1, 1, -1)).cumsum() 2119 h = hs_cs[-1] + 1 2120 g = (hs_cs / arange(2, N)).sum() 2121 2122 a = (4*g - 6) * (k - 1) + (10 - 6*g)*H 2123 b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6 2124 c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h 2125 d = (2*h + 6)*k**2 - 4*h*k 2126 sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) 
* (N - 3.)) 2127 m = k - 1 2128 A2 = (A2kN - m) / math.sqrt(sigmasq) 2129 2130 # The b_i values are the interpolation coefficients from Table 2 2131 # of Scholz and Stephens 1987 2132 b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085]) 2133 b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615]) 2134 b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154]) 2135 critical = b0 + b1 / math.sqrt(m) + b2 / m 2136 2137 sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001]) 2138 if A2 < critical.min(): 2139 p = sig.max() 2140 warnings.warn("p-value capped: true value larger than {}".format(p), 2141 stacklevel=2) 2142 elif A2 > critical.max(): 2143 p = sig.min() 2144 warnings.warn("p-value floored: true value smaller than {}".format(p), 2145 stacklevel=2) 2146 else: 2147 # interpolation of probit of significance level 2148 pf = np.polyfit(critical, log(sig), 2) 2149 p = math.exp(np.polyval(pf, A2)) 2150 2151 return Anderson_ksampResult(A2, critical, p) 2152 2153 2154AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue')) 2155 2156 2157class _ABW: 2158 """Distribution of Ansari-Bradley W-statistic under the null hypothesis.""" 2159 # TODO: calculate exact distribution considering ties 2160 # We could avoid summing over more than half the frequencies, 2161 # but inititally it doesn't seem worth the extra complexity 2162 2163 def __init__(self): 2164 """Minimal initializer.""" 2165 self.m = None 2166 self.n = None 2167 self.astart = None 2168 self.total = None 2169 self.freqs = None 2170 2171 def _recalc(self, n, m): 2172 """When necessary, recalculate exact distribution.""" 2173 if n != self.n or m != self.m: 2174 self.n, self.m = n, m 2175 # distribution is NOT symmetric when m + n is odd 2176 # n is len(x), m is len(y), and ratio of scales is defined x/y 2177 astart, a1, _ = statlib.gscale(n, m) 2178 self.astart = astart # minimum value of statistic 2179 # Exact distribution of test statistic under null hypothesis 2180 # expressed as frequencies/counts/integers to maintain precision. 2181 # Stored as floats to avoid overflow of sums. 2182 self.freqs = a1.astype(np.float64) 2183 self.total = self.freqs.sum() # could calculate from m and n 2184 # probability mass is self.freqs / self.total; 2185 2186 def pmf(self, k, n, m): 2187 """Probability mass function.""" 2188 self._recalc(n, m) 2189 # The convention here is that PMF at k = 12.5 is the same as at k = 12, 2190 # -> use `floor` in case of ties. 2191 ind = np.floor(k - self.astart).astype(int) 2192 return self.freqs[ind] / self.total 2193 2194 def cdf(self, k, n, m): 2195 """Cumulative distribution function.""" 2196 self._recalc(n, m) 2197 # Null distribution derived without considering ties is 2198 # approximate. Round down to avoid Type I error. 2199 ind = np.ceil(k - self.astart).astype(int) 2200 return self.freqs[:ind+1].sum() / self.total 2201 2202 def sf(self, k, n, m): 2203 """Survival function.""" 2204 self._recalc(n, m) 2205 # Null distribution derived without considering ties is 2206 # approximate. Round down to avoid Type I error. 2207 ind = np.floor(k - self.astart).astype(int) 2208 return self.freqs[ind:].sum() / self.total 2209 2210 2211# Maintain state for faster repeat calls to ansari w/ method='exact' 2212_abw_state = _ABW() 2213 2214 2215def ansari(x, y, alternative='two-sided'): 2216 """Perform the Ansari-Bradley test for equal scale parameters. 
2217 2218 The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test 2219 for the equality of the scale parameter of the distributions 2220 from which two samples were drawn. The null hypothesis states that 2221 the ratio of the scale of the distribution underlying `x` to the scale 2222 of the distribution underlying `y` is 1. 2223 2224 Parameters 2225 ---------- 2226 x, y : array_like 2227 Arrays of sample data. 2228 alternative : {'two-sided', 'less', 'greater'}, optional 2229 Defines the alternative hypothesis. Default is 'two-sided'. 2230 The following options are available: 2231 2232 * 'two-sided': the ratio of scales is not equal to 1. 2233 * 'less': the ratio of scales is less than 1. 2234 * 'greater': the ratio of scales is greater than 1. 2235 2236 .. versionadded:: 1.7.0 2237 2238 Returns 2239 ------- 2240 statistic : float 2241 The Ansari-Bradley test statistic. 2242 pvalue : float 2243 The p-value of the hypothesis test. 2244 2245 See Also 2246 -------- 2247 fligner : A non-parametric test for the equality of k variances 2248 mood : A non-parametric test for the equality of two scale parameters 2249 2250 Notes 2251 ----- 2252 The p-value given is exact when the sample sizes are both less than 2253 55 and there are no ties, otherwise a normal approximation for the 2254 p-value is used. 2255 2256 References 2257 ---------- 2258 .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for 2259 dispersions, Annals of Mathematical Statistics, 31, 1174-1189. 2260 .. [2] Sprent, Peter and N.C. Smeeton. Applied nonparametric 2261 statistical methods. 3rd ed. Chapman and Hall/CRC. 2001. 2262 Section 5.8.2. 2263 .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality 2264 Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf 2265 2266 Examples 2267 -------- 2268 >>> from scipy.stats import ansari 2269 >>> rng = np.random.default_rng() 2270 2271 For these examples, we'll create three random data sets. The first 2272 two, with sizes 35 and 25, are drawn from a normal distribution with 2273 mean 0 and standard deviation 2. The third data set has size 25 and 2274 is drawn from a normal distribution with standard deviation 1.25. 2275 2276 >>> x1 = rng.normal(loc=0, scale=2, size=35) 2277 >>> x2 = rng.normal(loc=0, scale=2, size=25) 2278 >>> x3 = rng.normal(loc=0, scale=1.25, size=25) 2279 2280 First we apply `ansari` to `x1` and `x2`. These samples are drawn 2281 from the same distribution, so we expect the Ansari-Bradley test 2282 should not lead us to conclude that the scales of the distributions 2283 are different. 2284 2285 >>> ansari(x1, x2) 2286 AnsariResult(statistic=541.0, pvalue=0.9762532927399098) 2287 2288 With a p-value close to 1, we cannot conclude that there is a 2289 significant difference in the scales (as expected). 2290 2291 Now apply the test to `x1` and `x3`: 2292 2293 >>> ansari(x1, x3) 2294 AnsariResult(statistic=425.0, pvalue=0.0003087020407974518) 2295 2296 The probability of observing such an extreme value of the statistic 2297 under the null hypothesis of equal scales is only 0.03087%. We take this 2298 as evidence against the null hypothesis in favor of the alternative: 2299 the scales of the distributions from which the samples were drawn 2300 are not equal. 2301 2302 We can use the `alternative` parameter to perform a one-tailed test. 2303 In the above example, the scale of `x1` is greater than `x3` and so 2304 the ratio of scales of `x1` and `x3` is greater than 1. 
This means 2305 that the p-value when ``alternative='greater'`` should be near 0 and 2306 hence we should be able to reject the null hypothesis: 2307 2308 >>> ansari(x1, x3, alternative='greater') 2309 AnsariResult(statistic=425.0, pvalue=0.0001543510203987259) 2310 2311 As we can see, the p-value is indeed quite low. Use of 2312 ``alternative='less'`` should thus yield a large p-value: 2313 2314 >>> ansari(x1, x3, alternative='less') 2315 AnsariResult(statistic=425.0, pvalue=0.9998643258449039) 2316 2317 """ 2318 if alternative not in {'two-sided', 'greater', 'less'}: 2319 raise ValueError("'alternative' must be 'two-sided'," 2320 " 'greater', or 'less'.") 2321 x, y = asarray(x), asarray(y) 2322 n = len(x) 2323 m = len(y) 2324 if m < 1: 2325 raise ValueError("Not enough other observations.") 2326 if n < 1: 2327 raise ValueError("Not enough test observations.") 2328 2329 N = m + n 2330 xy = r_[x, y] # combine 2331 rank = stats.rankdata(xy) 2332 symrank = amin(array((rank, N - rank + 1)), 0) 2333 AB = np.sum(symrank[:n], axis=0) 2334 uxy = unique(xy) 2335 repeats = (len(uxy) != len(xy)) 2336 exact = ((m < 55) and (n < 55) and not repeats) 2337 if repeats and (m < 55 or n < 55): 2338 warnings.warn("Ties preclude use of exact statistic.") 2339 if exact: 2340 if alternative == 'two-sided': 2341 pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m), 2342 _abw_state.sf(AB, n, m)) 2343 elif alternative == 'greater': 2344 # AB statistic is _smaller_ when ratio of scales is larger, 2345 # so this is the opposite of the usual calculation 2346 pval = _abw_state.cdf(AB, n, m) 2347 else: 2348 pval = _abw_state.sf(AB, n, m) 2349 return AnsariResult(AB, min(1.0, pval)) 2350 2351 # otherwise compute normal approximation 2352 if N % 2: # N odd 2353 mnAB = n * (N+1.0)**2 / 4.0 / N 2354 varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2) 2355 else: 2356 mnAB = n * (N+2.0) / 4.0 2357 varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0) 2358 if repeats: # adjust variance estimates 2359 # compute np.sum(tj * rj**2,axis=0) 2360 fac = np.sum(symrank**2, axis=0) 2361 if N % 2: # N odd 2362 varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1)) 2363 else: # N even 2364 varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1)) 2365 2366 # Small values of AB indicate larger dispersion for the x sample. 2367 # Large values of AB indicate larger dispersion for the y sample. 2368 # This is opposite to the way we define the ratio of scales. see [1]_. 2369 z = (mnAB - AB) / sqrt(varAB) 2370 z, pval = _normtest_finish(z, alternative) 2371 return AnsariResult(AB, pval) 2372 2373 2374BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue')) 2375 2376 2377def bartlett(*args): 2378 """Perform Bartlett's test for equal variances. 2379 2380 Bartlett's test tests the null hypothesis that all input samples 2381 are from populations with equal variances. For samples 2382 from significantly non-normal populations, Levene's test 2383 `levene` is more robust. 2384 2385 Parameters 2386 ---------- 2387 sample1, sample2,... : array_like 2388 arrays of sample data. Only 1d arrays are accepted, they may have 2389 different lengths. 2390 2391 Returns 2392 ------- 2393 statistic : float 2394 The test statistic. 2395 pvalue : float 2396 The p-value of the test. 2397 2398 See Also 2399 -------- 2400 fligner : A non-parametric test for the equality of k variances 2401 levene : A robust parametric test for equality of k variances 2402 2403 Notes 2404 ----- 2405 Conover et al. 
(1981) examine many of the existing parametric and 2406 nonparametric tests by extensive simulations and they conclude that the 2407 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 2408 superior in terms of robustness of departures from normality and power 2409 ([3]_). 2410 2411 References 2412 ---------- 2413 .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm 2414 2415 .. [2] Snedecor, George W. and Cochran, William G. (1989), Statistical 2416 Methods, Eighth Edition, Iowa State University Press. 2417 2418 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 2419 Hypothesis Testing based on Quadratic Inference Function. Technical 2420 Report #99-03, Center for Likelihood Studies, Pennsylvania State 2421 University. 2422 2423 .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical 2424 Tests. Proceedings of the Royal Society of London. Series A, 2425 Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282. 2426 2427 Examples 2428 -------- 2429 Test whether or not the lists `a`, `b` and `c` come from populations 2430 with equal variances. 2431 2432 >>> from scipy.stats import bartlett 2433 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 2434 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 2435 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 2436 >>> stat, p = bartlett(a, b, c) 2437 >>> p 2438 1.1254782518834628e-05 2439 2440 The very small p-value suggests that the populations do not have equal 2441 variances. 2442 2443 This is not surprising, given that the sample variance of `b` is much 2444 larger than that of `a` and `c`: 2445 2446 >>> [np.var(x, ddof=1) for x in [a, b, c]] 2447 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 2448 2449 """ 2450 # Handle empty input and input that is not 1d 2451 for a in args: 2452 if np.asanyarray(a).size == 0: 2453 return BartlettResult(np.nan, np.nan) 2454 if np.asanyarray(a).ndim > 1: 2455 raise ValueError('Samples must be one-dimensional.') 2456 2457 k = len(args) 2458 if k < 2: 2459 raise ValueError("Must enter at least two input sample vectors.") 2460 Ni = np.empty(k) 2461 ssq = np.empty(k, 'd') 2462 for j in range(k): 2463 Ni[j] = len(args[j]) 2464 ssq[j] = np.var(args[j], ddof=1) 2465 Ntot = np.sum(Ni, axis=0) 2466 spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k)) 2467 numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0) 2468 denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) - 2469 1.0/(Ntot - k)) 2470 T = numer / denom 2471 pval = distributions.chi2.sf(T, k - 1) # 1 - cdf 2472 2473 return BartlettResult(T, pval) 2474 2475 2476LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue')) 2477 2478 2479def levene(*args, center='median', proportiontocut=0.05): 2480 """Perform Levene test for equal variances. 2481 2482 The Levene test tests the null hypothesis that all input samples 2483 are from populations with equal variances. Levene's test is an 2484 alternative to Bartlett's test `bartlett` in the case where 2485 there are significant deviations from normality. 2486 2487 Parameters 2488 ---------- 2489 sample1, sample2, ... : array_like 2490 The sample data, possibly with different lengths. Only one-dimensional 2491 samples are accepted. 2492 center : {'mean', 'median', 'trimmed'}, optional 2493 Which function of the data to use in the test. The default 2494 is 'median'. 
    proportiontocut : float, optional
        When `center` is 'trimmed', this gives the proportion of data points
        to cut from each end. (See `scipy.stats.trim_mean`.)
        Default is 0.05.

    Returns
    -------
    statistic : float
        The test statistic.
    pvalue : float
        The p-value for the test.

    Notes
    -----
    Three variations of Levene's test are possible.  The possibilities
    and their recommended usages are:

      * 'median' : Recommended for skewed (non-normal) distributions.
      * 'mean' : Recommended for symmetric, moderate-tailed distributions.
      * 'trimmed' : Recommended for heavy-tailed distributions.

    The test version using the mean was proposed in the original article
    of Levene ([2]_), while the median and trimmed mean have been studied by
    Brown and Forsythe ([3]_), sometimes also referred to as the
    Brown-Forsythe test.

    References
    ----------
    .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
    .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
           Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
           Stanford University Press, pp. 278-292.
    .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
           Statistical Association, 69, 364-367.

    Examples
    --------
    Test whether or not the lists `a`, `b` and `c` come from populations
    with equal variances.

    >>> from scipy.stats import levene
    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
    >>> stat, p = levene(a, b, c)
    >>> p
    0.002431505967249681

    The small p-value suggests that the populations do not have equal
    variances.
2545 2546 This is not surprising, given that the sample variance of `b` is much 2547 larger than that of `a` and `c`: 2548 2549 >>> [np.var(x, ddof=1) for x in [a, b, c]] 2550 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 2551 2552 """ 2553 if center not in ['mean', 'median', 'trimmed']: 2554 raise ValueError("center must be 'mean', 'median' or 'trimmed'.") 2555 2556 k = len(args) 2557 if k < 2: 2558 raise ValueError("Must enter at least two input sample vectors.") 2559 # check for 1d input 2560 for j in range(k): 2561 if np.asanyarray(args[j]).ndim > 1: 2562 raise ValueError('Samples must be one-dimensional.') 2563 2564 Ni = np.empty(k) 2565 Yci = np.empty(k, 'd') 2566 2567 if center == 'median': 2568 func = lambda x: np.median(x, axis=0) 2569 elif center == 'mean': 2570 func = lambda x: np.mean(x, axis=0) 2571 else: # center == 'trimmed' 2572 args = tuple(stats.trimboth(np.sort(arg), proportiontocut) 2573 for arg in args) 2574 func = lambda x: np.mean(x, axis=0) 2575 2576 for j in range(k): 2577 Ni[j] = len(args[j]) 2578 Yci[j] = func(args[j]) 2579 Ntot = np.sum(Ni, axis=0) 2580 2581 # compute Zij's 2582 Zij = [None] * k 2583 for i in range(k): 2584 Zij[i] = abs(asarray(args[i]) - Yci[i]) 2585 2586 # compute Zbari 2587 Zbari = np.empty(k, 'd') 2588 Zbar = 0.0 2589 for i in range(k): 2590 Zbari[i] = np.mean(Zij[i], axis=0) 2591 Zbar += Zbari[i] * Ni[i] 2592 2593 Zbar /= Ntot 2594 numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0) 2595 2596 # compute denom_variance 2597 dvar = 0.0 2598 for i in range(k): 2599 dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0) 2600 2601 denom = (k - 1.0) * dvar 2602 2603 W = numer / denom 2604 pval = distributions.f.sf(W, k-1, Ntot-k) # 1 - cdf 2605 return LeveneResult(W, pval) 2606 2607 2608def binom_test(x, n=None, p=0.5, alternative='two-sided'): 2609 """Perform a test that the probability of success is p. 2610 2611 Note: `binom_test` is deprecated; it is recommended that `binomtest` 2612 be used instead. 2613 2614 This is an exact, two-sided test of the null hypothesis 2615 that the probability of success in a Bernoulli experiment 2616 is `p`. 2617 2618 Parameters 2619 ---------- 2620 x : int or array_like 2621 The number of successes, or if x has length 2, it is the 2622 number of successes and the number of failures. 2623 n : int 2624 The number of trials. This is ignored if x gives both the 2625 number of successes and failures. 2626 p : float, optional 2627 The hypothesized probability of success. ``0 <= p <= 1``. The 2628 default value is ``p = 0.5``. 2629 alternative : {'two-sided', 'greater', 'less'}, optional 2630 Indicates the alternative hypothesis. The default value is 2631 'two-sided'. 2632 2633 Returns 2634 ------- 2635 p-value : float 2636 The p-value of the hypothesis test. 2637 2638 References 2639 ---------- 2640 .. [1] https://en.wikipedia.org/wiki/Binomial_test 2641 2642 Examples 2643 -------- 2644 >>> from scipy import stats 2645 2646 A car manufacturer claims that no more than 10% of their cars are unsafe. 2647 15 cars are inspected for safety, 3 were found to be unsafe. Test the 2648 manufacturer's claim: 2649 2650 >>> stats.binom_test(3, n=15, p=0.1, alternative='greater') 2651 0.18406106910639114 2652 2653 The null hypothesis cannot be rejected at the 5% level of significance 2654 because the returned p-value is greater than the critical value of 5%. 
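
    As noted above, `binom_test` is deprecated; a sketch of the equivalent
    call with its recommended replacement `binomtest` (added in SciPy 1.7.0)
    is:

    >>> result = stats.binomtest(3, n=15, p=0.1, alternative='greater')
    >>> pval = result.pvalue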
2655 2656 """ 2657 x = atleast_1d(x).astype(np.int_) 2658 if len(x) == 2: 2659 n = x[1] + x[0] 2660 x = x[0] 2661 elif len(x) == 1: 2662 x = x[0] 2663 if n is None or n < x: 2664 raise ValueError("n must be >= x") 2665 n = np.int_(n) 2666 else: 2667 raise ValueError("Incorrect length for x.") 2668 2669 if (p > 1.0) or (p < 0.0): 2670 raise ValueError("p must be in range [0,1]") 2671 2672 if alternative not in ('two-sided', 'less', 'greater'): 2673 raise ValueError("alternative not recognized\n" 2674 "should be 'two-sided', 'less' or 'greater'") 2675 2676 if alternative == 'less': 2677 pval = distributions.binom.cdf(x, n, p) 2678 return pval 2679 2680 if alternative == 'greater': 2681 pval = distributions.binom.sf(x-1, n, p) 2682 return pval 2683 2684 # if alternative was neither 'less' nor 'greater', then it's 'two-sided' 2685 d = distributions.binom.pmf(x, n, p) 2686 rerr = 1 + 1e-7 2687 if x == p * n: 2688 # special case as shortcut, would also be handled by `else` below 2689 pval = 1. 2690 elif x < p * n: 2691 i = np.arange(np.ceil(p * n), n+1) 2692 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 2693 pval = (distributions.binom.cdf(x, n, p) + 2694 distributions.binom.sf(n - y, n, p)) 2695 else: 2696 i = np.arange(np.floor(p*n) + 1) 2697 y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0) 2698 pval = (distributions.binom.cdf(y-1, n, p) + 2699 distributions.binom.sf(x-1, n, p)) 2700 2701 return min(1.0, pval) 2702 2703 2704def _apply_func(x, g, func): 2705 # g is list of indices into x 2706 # separating x into different groups 2707 # func should be applied over the groups 2708 g = unique(r_[0, g, len(x)]) 2709 output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)] 2710 2711 return asarray(output) 2712 2713 2714FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue')) 2715 2716 2717def fligner(*args, center='median', proportiontocut=0.05): 2718 """Perform Fligner-Killeen test for equality of variance. 2719 2720 Fligner's test tests the null hypothesis that all input samples 2721 are from populations with equal variances. Fligner-Killeen's test is 2722 distribution free when populations are identical [2]_. 2723 2724 Parameters 2725 ---------- 2726 sample1, sample2, ... : array_like 2727 Arrays of sample data. Need not be the same length. 2728 center : {'mean', 'median', 'trimmed'}, optional 2729 Keyword argument controlling which function of the data is used in 2730 computing the test statistic. The default is 'median'. 2731 proportiontocut : float, optional 2732 When `center` is 'trimmed', this gives the proportion of data points 2733 to cut from each end. (See `scipy.stats.trim_mean`.) 2734 Default is 0.05. 2735 2736 Returns 2737 ------- 2738 statistic : float 2739 The test statistic. 2740 pvalue : float 2741 The p-value for the hypothesis test. 2742 2743 See Also 2744 -------- 2745 bartlett : A parametric test for equality of k variances in normal samples 2746 levene : A robust parametric test for equality of k variances 2747 2748 Notes 2749 ----- 2750 As with Levene's test there are three variants of Fligner's test that 2751 differ by the measure of central tendency used in the test. See `levene` 2752 for more information. 2753 2754 Conover et al. (1981) examine many of the existing parametric and 2755 nonparametric tests by extensive simulations and they conclude that the 2756 tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to be 2757 superior in terms of robustness of departures from normality and power [3]_. 
2758 2759 References 2760 ---------- 2761 .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 2762 Hypothesis Testing based on Quadratic Inference Function. Technical 2763 Report #99-03, Center for Likelihood Studies, Pennsylvania State 2764 University. 2765 https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf 2766 2767 .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample 2768 tests for scale. 'Journal of the American Statistical Association.' 2769 71(353), 210-213. 2770 2771 .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and 2772 Hypothesis Testing based on Quadratic Inference Function. Technical 2773 Report #99-03, Center for Likelihood Studies, Pennsylvania State 2774 University. 2775 2776 .. [4] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A 2777 comparative study of tests for homogeneity of variances, with 2778 applications to the outer continental shelf biding data. 2779 Technometrics, 23(4), 351-361. 2780 2781 Examples 2782 -------- 2783 Test whether or not the lists `a`, `b` and `c` come from populations 2784 with equal variances. 2785 2786 >>> from scipy.stats import fligner 2787 >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99] 2788 >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05] 2789 >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98] 2790 >>> stat, p = fligner(a, b, c) 2791 >>> p 2792 0.00450826080004775 2793 2794 The small p-value suggests that the populations do not have equal 2795 variances. 2796 2797 This is not surprising, given that the sample variance of `b` is much 2798 larger than that of `a` and `c`: 2799 2800 >>> [np.var(x, ddof=1) for x in [a, b, c]] 2801 [0.007054444444444413, 0.13073888888888888, 0.008890000000000002] 2802 2803 """ 2804 if center not in ['mean', 'median', 'trimmed']: 2805 raise ValueError("center must be 'mean', 'median' or 'trimmed'.") 2806 2807 # Handle empty input 2808 for a in args: 2809 if np.asanyarray(a).size == 0: 2810 return FlignerResult(np.nan, np.nan) 2811 2812 k = len(args) 2813 if k < 2: 2814 raise ValueError("Must enter at least two input sample vectors.") 2815 2816 if center == 'median': 2817 func = lambda x: np.median(x, axis=0) 2818 elif center == 'mean': 2819 func = lambda x: np.mean(x, axis=0) 2820 else: # center == 'trimmed' 2821 args = tuple(stats.trimboth(arg, proportiontocut) for arg in args) 2822 func = lambda x: np.mean(x, axis=0) 2823 2824 Ni = asarray([len(args[j]) for j in range(k)]) 2825 Yci = asarray([func(args[j]) for j in range(k)]) 2826 Ntot = np.sum(Ni, axis=0) 2827 # compute Zij's 2828 Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)] 2829 allZij = [] 2830 g = [0] 2831 for i in range(k): 2832 allZij.extend(list(Zij[i])) 2833 g.append(len(allZij)) 2834 2835 ranks = stats.rankdata(allZij) 2836 a = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5) 2837 2838 # compute Aibar 2839 Aibar = _apply_func(a, g, np.sum) / Ni 2840 anbar = np.mean(a, axis=0) 2841 varsq = np.var(a, axis=0, ddof=1) 2842 Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq 2843 pval = distributions.chi2.sf(Xsq, k - 1) # 1 - cdf 2844 return FlignerResult(Xsq, pval) 2845 2846 2847def mood(x, y, axis=0, alternative="two-sided"): 2848 """Perform Mood's test for equal scale parameters. 2849 2850 Mood's two-sample test for scale parameters is a non-parametric 2851 test for the null hypothesis that two samples are drawn from the 2852 same distribution with the same scale parameter. 

    Parameters
    ----------
    x, y : array_like
        Arrays of sample data.
    axis : int, optional
        The axis along which the samples are tested.  `x` and `y` can be of
        different length along `axis`.
        If `axis` is None, `x` and `y` are flattened and the test is done on
        all values in the flattened arrays.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the scales of the distributions underlying `x` and `y`
          are different.
        * 'less': the scale of the distribution underlying `x` is less than
          the scale of the distribution underlying `y`.
        * 'greater': the scale of the distribution underlying `x` is greater
          than the scale of the distribution underlying `y`.

        .. versionadded:: 1.7.0

    Returns
    -------
    z : scalar or ndarray
        The z-score for the hypothesis test.  For 1-D inputs a scalar is
        returned.
    p-value : scalar or ndarray
        The p-value for the hypothesis test.

    See Also
    --------
    fligner : A non-parametric test for the equality of k variances
    ansari : A non-parametric test for the equality of 2 variances
    bartlett : A parametric test for equality of k variances in normal samples
    levene : A parametric test for equality of k variances

    Notes
    -----
    The data are assumed to be drawn from probability distributions ``f(x)``
    and ``f(x/s) / s`` respectively, for some probability density function f.
    The null hypothesis is that ``s == 1``.

    For multi-dimensional arrays, if the inputs are of shapes
    ``(n0, n1, n2, n3)`` and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
    resulting z and p values will have shape ``(n0, n2, n3)``.  Note that
    ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
2901 2902 Examples 2903 -------- 2904 >>> from scipy import stats 2905 >>> rng = np.random.default_rng() 2906 >>> x2 = rng.standard_normal((2, 45, 6, 7)) 2907 >>> x1 = rng.standard_normal((2, 30, 6, 7)) 2908 >>> z, p = stats.mood(x1, x2, axis=1) 2909 >>> p.shape 2910 (2, 6, 7) 2911 2912 Find the number of points where the difference in scale is not significant: 2913 2914 >>> (p > 0.1).sum() 2915 78 2916 2917 Perform the test with different scales: 2918 2919 >>> x1 = rng.standard_normal((2, 30)) 2920 >>> x2 = rng.standard_normal((2, 35)) * 10.0 2921 >>> stats.mood(x1, x2, axis=1) 2922 (array([-5.76174136, -6.12650783]), array([8.32505043e-09, 8.98287869e-10])) 2923 2924 """ 2925 x = np.asarray(x, dtype=float) 2926 y = np.asarray(y, dtype=float) 2927 2928 if axis is None: 2929 x = x.flatten() 2930 y = y.flatten() 2931 axis = 0 2932 2933 if axis < 0: 2934 axis = x.ndim + axis 2935 2936 # Determine shape of the result arrays 2937 res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis]) 2938 if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if 2939 ax != axis])): 2940 raise ValueError("Dimensions of x and y on all axes except `axis` " 2941 "should match") 2942 2943 n = x.shape[axis] 2944 m = y.shape[axis] 2945 N = m + n 2946 if N < 3: 2947 raise ValueError("Not enough observations.") 2948 2949 xy = np.concatenate((x, y), axis=axis) 2950 if axis != 0: 2951 xy = np.rollaxis(xy, axis) 2952 2953 xy = xy.reshape(xy.shape[0], -1) 2954 2955 # Generalized to the n-dimensional case by adding the axis argument, and 2956 # using for loops, since rankdata is not vectorized. For improving 2957 # performance consider vectorizing rankdata function. 2958 all_ranks = np.empty_like(xy) 2959 for j in range(xy.shape[1]): 2960 all_ranks[:, j] = stats.rankdata(xy[:, j]) 2961 2962 Ri = all_ranks[:n] 2963 M = np.sum((Ri - (N + 1.0) / 2)**2, axis=0) 2964 # Approx stat. 2965 mnM = n * (N * N - 1.0) / 12 2966 varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180 2967 z = (M - mnM) / sqrt(varM) 2968 z, pval = _normtest_finish(z, alternative) 2969 2970 if res_shape == (): 2971 # Return scalars, not 0-D arrays 2972 z = z[0] 2973 pval = pval[0] 2974 else: 2975 z.shape = res_shape 2976 pval.shape = res_shape 2977 2978 return z, pval 2979 2980 2981WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue')) 2982 2983 2984def wilcoxon(x, y=None, zero_method="wilcox", correction=False, 2985 alternative="two-sided", mode='auto'): 2986 """Calculate the Wilcoxon signed-rank test. 2987 2988 The Wilcoxon signed-rank test tests the null hypothesis that two 2989 related paired samples come from the same distribution. In particular, 2990 it tests whether the distribution of the differences x - y is symmetric 2991 about zero. It is a non-parametric version of the paired T-test. 2992 2993 Parameters 2994 ---------- 2995 x : array_like 2996 Either the first set of measurements (in which case ``y`` is the second 2997 set of measurements), or the differences between two sets of 2998 measurements (in which case ``y`` is not to be specified.) Must be 2999 one-dimensional. 3000 y : array_like, optional 3001 Either the second set of measurements (if ``x`` is the first set of 3002 measurements), or not specified (if ``x`` is the differences between 3003 two sets of measurements.) Must be one-dimensional. 
3004 zero_method : {"pratt", "wilcox", "zsplit"}, optional 3005 The following options are available (default is "wilcox"): 3006 3007 * "pratt": Includes zero-differences in the ranking process, 3008 but drops the ranks of the zeros, see [4]_, (more conservative). 3009 * "wilcox": Discards all zero-differences, the default. 3010 * "zsplit": Includes zero-differences in the ranking process and 3011 split the zero rank between positive and negative ones. 3012 correction : bool, optional 3013 If True, apply continuity correction by adjusting the Wilcoxon rank 3014 statistic by 0.5 towards the mean value when computing the 3015 z-statistic if a normal approximation is used. Default is False. 3016 alternative : {"two-sided", "greater", "less"}, optional 3017 The alternative hypothesis to be tested, see Notes. Default is 3018 "two-sided". 3019 mode : {"auto", "exact", "approx"} 3020 Method to calculate the p-value, see Notes. Default is "auto". 3021 3022 Returns 3023 ------- 3024 statistic : float 3025 If ``alternative`` is "two-sided", the sum of the ranks of the 3026 differences above or below zero, whichever is smaller. 3027 Otherwise the sum of the ranks of the differences above zero. 3028 pvalue : float 3029 The p-value for the test depending on ``alternative`` and ``mode``. 3030 3031 See Also 3032 -------- 3033 kruskal, mannwhitneyu 3034 3035 Notes 3036 ----- 3037 The test has been introduced in [4]_. Given n independent samples 3038 (xi, yi) from a bivariate distribution (i.e. paired samples), 3039 it computes the differences di = xi - yi. One assumption of the test 3040 is that the differences are symmetric, see [2]_. 3041 The two-sided test has the null hypothesis that the median of the 3042 differences is zero against the alternative that it is different from 3043 zero. The one-sided test has the null hypothesis that the median is 3044 positive against the alternative that it is negative 3045 (``alternative == 'less'``), or vice versa (``alternative == 'greater.'``). 3046 3047 To derive the p-value, the exact distribution (``mode == 'exact'``) 3048 can be used for sample sizes of up to 25. The default ``mode == 'auto'`` 3049 uses the exact distribution if there are at most 25 observations and no 3050 ties, otherwise a normal approximation is used (``mode == 'approx'``). 3051 3052 The treatment of ties can be controlled by the parameter `zero_method`. 3053 If ``zero_method == 'pratt'``, the normal approximation is adjusted as in 3054 [5]_. A typical rule is to require that n > 20 ([2]_, p. 383). 3055 3056 References 3057 ---------- 3058 .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test 3059 .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971. 3060 .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed 3061 Rank Procedures, Journal of the American Statistical Association, 3062 Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526` 3063 .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods, 3064 Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968` 3065 .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank 3066 Sampling Distribution When Zero Differences are Present, 3067 Journal of the American Statistical Association, Vol. 62, 1967, 3068 pp. 1068-1069. 
           :doi:`10.1080/01621459.1967.10500917`

    Examples
    --------
    In [4]_, the differences in height between cross- and self-fertilized
    corn plants are given as follows:

    >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]

    Cross-fertilized plants appear to be higher.  To test the null
    hypothesis that there is no height difference, we can apply the
    two-sided test:

    >>> from scipy.stats import wilcoxon
    >>> w, p = wilcoxon(d)
    >>> w, p
    (24.0, 0.041259765625)

    Hence, we would reject the null hypothesis at a significance level of 5%,
    concluding that there is a difference in height between the groups.
    To confirm that the median of the differences can be assumed to be
    positive, we use:

    >>> w, p = wilcoxon(d, alternative='greater')
    >>> w, p
    (96.0, 0.0206298828125)

    This shows that the null hypothesis that the median is negative can be
    rejected at a significance level of 5% in favor of the alternative that
    the median is greater than zero.  The p-values above are exact.  Using
    the normal approximation gives very similar values:

    >>> w, p = wilcoxon(d, mode='approx')
    >>> w, p
    (24.0, 0.04088813291185591)

    Note that the statistic changed to 96 in the one-sided case (the sum
    of ranks of positive differences) whereas it is 24 in the two-sided
    case (the minimum of sum of ranks above and below zero).

    """
    if mode not in ["auto", "approx", "exact"]:
        raise ValueError("mode must be either 'auto', 'approx' or 'exact'")

    if zero_method not in ["wilcox", "pratt", "zsplit"]:
        raise ValueError("Zero method must be either 'wilcox' "
                         "or 'pratt' or 'zsplit'")

    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError("Alternative must be either 'two-sided', "
                         "'greater' or 'less'")

    if y is None:
        d = asarray(x)
        if d.ndim > 1:
            raise ValueError('Sample x must be one-dimensional.')
    else:
        x, y = map(asarray, (x, y))
        if x.ndim > 1 or y.ndim > 1:
            raise ValueError('Samples x and y must be one-dimensional.')
        if len(x) != len(y):
            raise ValueError('The samples x and y must have the same length.')
        d = x - y

    if mode == "auto":
        if len(d) <= 25:
            mode = "exact"
        else:
            mode = "approx"

    n_zero = np.sum(d == 0)
    if n_zero > 0 and mode == "exact":
        mode = "approx"
        warnings.warn("Exact p-value calculation does not work if there are "
                      "ties. Switching to normal approximation.")

    if mode == "approx":
        if zero_method in ["wilcox", "pratt"]:
            if n_zero == len(d):
                raise ValueError("zero_method 'wilcox' and 'pratt' do not "
                                 "work if x - y is zero for all elements.")
        if zero_method == "wilcox":
            # Keep all non-zero differences
            d = compress(np.not_equal(d, 0), d)

    count = len(d)
    if count < 10 and mode == "approx":
        warnings.warn("Sample size too small for normal approximation.")

    r = stats.rankdata(abs(d))
    r_plus = np.sum((d > 0) * r)
    r_minus = np.sum((d < 0) * r)

    if zero_method == "zsplit":
        r_zero = np.sum((d == 0) * r)
        r_plus += r_zero / 2.
        r_minus += r_zero / 2.

    # return min for two-sided test, but r_plus for one-sided test
    # the literature is not consistent here
    # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
    # i.e.
the sum of the ranks, so r_minus and the min can be inferred 3170 # (If alternative='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.) 3171 # [3] uses the r_plus for the one-sided test, keep min for two-sided test 3172 # to keep backwards compatibility 3173 if alternative == "two-sided": 3174 T = min(r_plus, r_minus) 3175 else: 3176 T = r_plus 3177 3178 if mode == "approx": 3179 mn = count * (count + 1.) * 0.25 3180 se = count * (count + 1.) * (2. * count + 1.) 3181 3182 if zero_method == "pratt": 3183 r = r[d != 0] 3184 # normal approximation needs to be adjusted, see Cureton (1967) 3185 mn -= n_zero * (n_zero + 1.) * 0.25 3186 se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.) 3187 3188 replist, repnum = find_repeats(r) 3189 if repnum.size != 0: 3190 # Correction for repeated elements. 3191 se -= 0.5 * (repnum * (repnum * repnum - 1)).sum() 3192 3193 se = sqrt(se / 24) 3194 3195 # apply continuity correction if applicable 3196 d = 0 3197 if correction: 3198 if alternative == "two-sided": 3199 d = 0.5 * np.sign(T - mn) 3200 elif alternative == "less": 3201 d = -0.5 3202 else: 3203 d = 0.5 3204 3205 # compute statistic and p-value using normal approximation 3206 z = (T - mn - d) / se 3207 if alternative == "two-sided": 3208 prob = 2. * distributions.norm.sf(abs(z)) 3209 elif alternative == "greater": 3210 # large T = r_plus indicates x is greater than y; i.e. 3211 # accept alternative in that case and return small p-value (sf) 3212 prob = distributions.norm.sf(z) 3213 else: 3214 prob = distributions.norm.cdf(z) 3215 elif mode == "exact": 3216 # get frequencies cnt of the possible positive ranksums r_plus 3217 cnt = _get_wilcoxon_distr(count) 3218 # note: r_plus is int (ties not allowed), need int for slices below 3219 r_plus = int(r_plus) 3220 if alternative == "two-sided": 3221 if r_plus == (len(cnt) - 1) // 2: 3222 # r_plus is the center of the distribution. 3223 prob = 1.0 3224 else: 3225 p_less = np.sum(cnt[:r_plus + 1]) / 2**count 3226 p_greater = np.sum(cnt[r_plus:]) / 2**count 3227 prob = 2*min(p_greater, p_less) 3228 elif alternative == "greater": 3229 prob = np.sum(cnt[r_plus:]) / 2**count 3230 else: 3231 prob = np.sum(cnt[:r_plus + 1]) / 2**count 3232 3233 return WilcoxonResult(T, prob) 3234 3235 3236def median_test(*args, ties='below', correction=True, lambda_=1, 3237 nan_policy='propagate'): 3238 """Perform a Mood's median test. 3239 3240 Test that two or more samples come from populations with the same median. 3241 3242 Let ``n = len(args)`` be the number of samples. The "grand median" of 3243 all the data is computed, and a contingency table is formed by 3244 classifying the values in each sample as being above or below the grand 3245 median. The contingency table, along with `correction` and `lambda_`, 3246 are passed to `scipy.stats.chi2_contingency` to compute the test statistic 3247 and p-value. 3248 3249 Parameters 3250 ---------- 3251 sample1, sample2, ... : array_like 3252 The set of samples. There must be at least two samples. 3253 Each sample must be a one-dimensional sequence containing at least 3254 one value. The samples are not required to have the same length. 3255 ties : str, optional 3256 Determines how values equal to the grand median are classified in 3257 the contingency table. The string must be one of:: 3258 3259 "below": 3260 Values equal to the grand median are counted as "below". 3261 "above": 3262 Values equal to the grand median are counted as "above". 3263 "ignore": 3264 Values equal to the grand median are not counted. 
3265 3266 The default is "below". 3267 correction : bool, optional 3268 If True, *and* there are just two samples, apply Yates' correction 3269 for continuity when computing the test statistic associated with 3270 the contingency table. Default is True. 3271 lambda_ : float or str, optional 3272 By default, the statistic computed in this test is Pearson's 3273 chi-squared statistic. `lambda_` allows a statistic from the 3274 Cressie-Read power divergence family to be used instead. See 3275 `power_divergence` for details. 3276 Default is 1 (Pearson's chi-squared statistic). 3277 nan_policy : {'propagate', 'raise', 'omit'}, optional 3278 Defines how to handle when input contains nan. 'propagate' returns nan, 3279 'raise' throws an error, 'omit' performs the calculations ignoring nan 3280 values. Default is 'propagate'. 3281 3282 Returns 3283 ------- 3284 stat : float 3285 The test statistic. The statistic that is returned is determined by 3286 `lambda_`. The default is Pearson's chi-squared statistic. 3287 p : float 3288 The p-value of the test. 3289 m : float 3290 The grand median. 3291 table : ndarray 3292 The contingency table. The shape of the table is (2, n), where 3293 n is the number of samples. The first row holds the counts of the 3294 values above the grand median, and the second row holds the counts 3295 of the values below the grand median. The table allows further 3296 analysis with, for example, `scipy.stats.chi2_contingency`, or with 3297 `scipy.stats.fisher_exact` if there are two samples, without having 3298 to recompute the table. If ``nan_policy`` is "propagate" and there 3299 are nans in the input, the return value for ``table`` is ``None``. 3300 3301 See Also 3302 -------- 3303 kruskal : Compute the Kruskal-Wallis H-test for independent samples. 3304 mannwhitneyu : Computes the Mann-Whitney rank test on samples x and y. 3305 3306 Notes 3307 ----- 3308 .. versionadded:: 0.15.0 3309 3310 References 3311 ---------- 3312 .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill 3313 (1950), pp. 394-399. 3314 .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010). 3315 See Sections 8.12 and 10.15. 3316 3317 Examples 3318 -------- 3319 A biologist runs an experiment in which there are three groups of plants. 3320 Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants. 3321 Each plant produces a number of seeds. The seed counts for each group 3322 are:: 3323 3324 Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49 3325 Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99 3326 Group 3: 0 3 9 22 23 25 25 33 34 34 40 45 46 48 62 67 84 3327 3328 The following code applies Mood's median test to these samples. 3329 3330 >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49] 3331 >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99] 3332 >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84] 3333 >>> from scipy.stats import median_test 3334 >>> stat, p, med, tbl = median_test(g1, g2, g3) 3335 3336 The median is 3337 3338 >>> med 3339 34.0 3340 3341 and the contingency table is 3342 3343 >>> tbl 3344 array([[ 5, 10, 7], 3345 [11, 5, 10]]) 3346 3347 `p` is too large to conclude that the medians are not the same: 3348 3349 >>> p 3350 0.12609082774093244 3351 3352 The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to 3353 `median_test`. 

    The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
    `median_test`.

    >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood")
    >>> p
    0.12224779737117837

    The median occurs several times in the data, so we'll get a different
    result if, for example, ``ties="above"`` is used:

    >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above")
    >>> p
    0.063873276069553273

    >>> tbl
    array([[ 5, 11, 9],
           [11,  4, 8]])

    This example demonstrates that if the data set is not large and there
    are values equal to the median, the p-value can be sensitive to the
    choice of `ties`.
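
    If the input contains NaN, the default ``nan_policy='propagate'`` makes
    all returned values NaN (and ``table`` is ``None``). As a small sketch
    (the variable names below are arbitrary), ``nan_policy='omit'`` instead
    drops the NaN values before the test, so the grand median is unchanged:

    >>> import numpy as np
    >>> stat_o, p_o, med_o, tbl_o = median_test(g1, g2, g3 + [np.nan],
    ...                                         nan_policy='omit')
    >>> med_o
    34.0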

    """
    if len(args) < 2:
        raise ValueError('median_test requires two or more samples.')

    ties_options = ['below', 'above', 'ignore']
    if ties not in ties_options:
        raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
                         "of: %s" % (ties, str(ties_options)[1:-1]))

    data = [np.asarray(arg) for arg in args]

    # Validate the sizes and shapes of the arguments.
    for k, d in enumerate(data):
        if d.size == 0:
            raise ValueError("Sample %d is empty. All samples must "
                             "contain at least one value." % (k + 1))
        if d.ndim != 1:
            raise ValueError("Sample %d has %d dimensions. All "
                             "samples must be one-dimensional sequences." %
                             (k + 1, d.ndim))

    cdata = np.concatenate(data)
    contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
    if contains_nan and nan_policy == 'propagate':
        return np.nan, np.nan, np.nan, None

    if contains_nan:
        grand_median = np.median(cdata[~np.isnan(cdata)])
    else:
        grand_median = np.median(cdata)
    # The if/else above is equivalent to ``grand_median = np.nanmedian(cdata)``.

    # Create the contingency table.
    table = np.zeros((2, len(data)), dtype=np.int64)
    for k, sample in enumerate(data):
        sample = sample[~np.isnan(sample)]

        nabove = count_nonzero(sample > grand_median)
        nbelow = count_nonzero(sample < grand_median)
        nequal = sample.size - (nabove + nbelow)
        table[0, k] += nabove
        table[1, k] += nbelow
        if ties == "below":
            table[1, k] += nequal
        elif ties == "above":
            table[0, k] += nequal

    # Check that no row or column of the table is all zero.
    # Such a table can not be given to chi2_contingency, because it would
    # have a zero in the table of expected frequencies.
    rowsums = table.sum(axis=1)
    if rowsums[0] == 0:
        raise ValueError("All values are below the grand median (%r)." %
                         grand_median)
    if rowsums[1] == 0:
        raise ValueError("All values are above the grand median (%r)." %
                         grand_median)
    if ties == "ignore":
        # We already checked that each sample has at least one value, but it
        # is possible that all those values equal the grand median. If `ties`
        # is "ignore", that would result in a column of zeros in `table`. We
        # check for that case here.
        zero_cols = np.nonzero((table == 0).all(axis=0))[0]
        if len(zero_cols) > 0:
            msg = ("All values in sample %d are equal to the grand "
                   "median (%r), so they are ignored, resulting in an "
                   "empty sample." % (zero_cols[0] + 1, grand_median))
            raise ValueError(msg)

    stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
                                              correction=correction)
    return stat, p, grand_median, table


def _circfuncs_common(samples, high, low, nan_policy='propagate'):
    # Ensure samples are array-like and size is not zero
    samples = np.asarray(samples)
    if samples.size == 0:
        return np.nan, np.asarray(np.nan), np.asarray(np.nan), None

    # Recast samples as radians that range between 0 and 2 pi and calculate
    # the sine and cosine
    sin_samp = sin((samples - low)*2.*pi / (high - low))
    cos_samp = cos((samples - low)*2.*pi / (high - low))

    # Apply the NaN policy
    contains_nan, nan_policy = _contains_nan(samples, nan_policy)
    if contains_nan and nan_policy == 'omit':
        mask = np.isnan(samples)
        # Set the sines and cosines that are NaN to zero
        sin_samp[mask] = 0.0
        cos_samp[mask] = 0.0
    else:
        mask = None

    return samples, sin_samp, cos_samp, mask


def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
    """Compute the circular mean for samples in a range.

    Parameters
    ----------
    samples : array_like
        Input array.
    high : float or int, optional
        High boundary for circular mean range. Default is ``2*pi``.
    low : float or int, optional
        Low boundary for circular mean range. Default is 0.
    axis : int, optional
        Axis along which means are computed. The default is to compute
        the mean of the flattened array.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing NaN. 'propagate' returns NaN,
        'raise' raises an error, and 'omit' performs the calculations
        ignoring NaN values. Default is 'propagate'.

    Returns
    -------
    circmean : float
        Circular mean.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circmean
    >>> circmean([0.1, 2*np.pi+0.2, 6*np.pi+0.3])
    0.2

    >>> circmean([0.2, 1.4, 2.6], high=1, low=0)
    0.4

    """
    samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low,
                                                           nan_policy=nan_policy)
    sin_sum = sin_samp.sum(axis=axis)
    cos_sum = cos_samp.sum(axis=axis)
    res = arctan2(sin_sum, cos_sum)

    mask_nan = ~np.isnan(res)
    if mask_nan.ndim > 0:
        mask = res[mask_nan] < 0
    else:
        mask = res < 0

    if mask.ndim > 0:
        mask_nan[mask_nan] = mask
        res[mask_nan] += 2*pi
    elif mask:
        res += 2*pi

    # Set output to NaN if no samples went into the mean
    if nmask is not None:
        if nmask.all():
            res = np.full(shape=res.shape, fill_value=np.nan)
        else:
            # Find out if any of the axes being averaged consist entirely of
            # NaN. If one exists, set the result (res) to NaN for that axis.
            nshape = 0 if axis is None else axis
            smask = nmask.shape[nshape] == nmask.sum(axis=axis)
            if smask.any():
                res[smask] = np.nan

    return res*(high - low)/2.0/pi + low


def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
    """Compute the circular variance for samples assumed to be in a range.

    Parameters
    ----------
    samples : array_like
        Input array.
    high : float or int, optional
        High boundary for circular variance range. Default is ``2*pi``.
    low : float or int, optional
        Low boundary for circular variance range. Default is 0.
    axis : int, optional
        Axis along which variances are computed. The default is to compute
        the variance of the flattened array.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing NaN. 'propagate' returns NaN,
        'raise' raises an error, and 'omit' performs the calculations
        ignoring NaN values. Default is 'propagate'.

    Returns
    -------
    circvar : float
        Circular variance.

    Notes
    -----
    This uses a definition of circular variance that in the limit of small
    angles returns a number close to the 'linear' variance.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circvar
    >>> circvar([0, 2*np.pi/3, 5*np.pi/3])
    2.19722457734
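
    As noted above, in the small-angle limit this is close to the ordinary
    (linear) variance. A rough numerical check (illustrative only; the array
    and tolerance below are arbitrary):

    >>> small = np.array([0.01, 0.02, 0.03])
    >>> bool(np.isclose(circvar(small), np.var(small), rtol=1e-3))
    True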

    """
    samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
                                                          nan_policy=nan_policy)
    if mask is None:
        sin_mean = sin_samp.mean(axis=axis)
        cos_mean = cos_samp.mean(axis=axis)
    else:
        nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
        nsum[nsum == 0] = np.nan
        sin_mean = sin_samp.sum(axis=axis) / nsum
        cos_mean = cos_samp.sum(axis=axis) / nsum
    # hypot can go slightly above 1 due to rounding errors
    with np.errstate(invalid='ignore'):
        R = np.minimum(1, hypot(sin_mean, cos_mean))

    return ((high - low)/2.0/pi)**2 * -2 * log(R)


def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
    """
    Compute the circular standard deviation for samples assumed to be in the
    range [low, high].

    Parameters
    ----------
    samples : array_like
        Input array.
    high : float or int, optional
        High boundary for circular standard deviation range.
        Default is ``2*pi``.
    low : float or int, optional
        Low boundary for circular standard deviation range. Default is 0.
    axis : int, optional
        Axis along which standard deviations are computed. The default is
        to compute the standard deviation of the flattened array.
    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing NaN. 'propagate' returns NaN,
        'raise' raises an error, and 'omit' performs the calculations
        ignoring NaN values. Default is 'propagate'.

    Returns
    -------
    circstd : float
        Circular standard deviation.

    Notes
    -----
    This uses a definition of circular standard deviation that in the limit
    of small angles returns a number close to the 'linear' standard
    deviation.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy.stats import circstd
    >>> circstd([0, 0.1*np.pi/2, 0.001*np.pi, 0.03*np.pi/2])
    0.063564063306

    """
    samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
                                                          nan_policy=nan_policy)
    if mask is None:
        sin_mean = sin_samp.mean(axis=axis)
        cos_mean = cos_samp.mean(axis=axis)
    else:
        nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
        nsum[nsum == 0] = np.nan
        sin_mean = sin_samp.sum(axis=axis) / nsum
        cos_mean = cos_samp.sum(axis=axis) / nsum
    # hypot can go slightly above 1 due to rounding errors
    with np.errstate(invalid='ignore'):
        R = np.minimum(1, hypot(sin_mean, cos_mean))

    return ((high - low)/2.0/pi) * sqrt(-2*log(R))