from __future__ import annotations
import math
import warnings
from collections import namedtuple

import numpy as np
from numpy import (isscalar, r_, log, around, unique, asarray, zeros,
                   arange, sort, amin, amax, atleast_1d, sqrt, array,
                   compress, pi, exp, ravel, count_nonzero, sin, cos,
                   arctan2, hypot)

from scipy import optimize
from scipy import special
from . import statlib
from . import stats
from .stats import find_repeats, _contains_nan, _normtest_finish
from .contingency import chi2_contingency
from . import distributions
from ._distn_infrastructure import rv_generic
from ._hypotests import _get_wilcoxon_distr


__all__ = ['mvsdist',
           'bayes_mvs', 'kstat', 'kstatvar', 'probplot', 'ppcc_max', 'ppcc_plot',
           'boxcox_llf', 'boxcox', 'boxcox_normmax', 'boxcox_normplot',
           'shapiro', 'anderson', 'ansari', 'bartlett', 'levene', 'binom_test',
           'fligner', 'mood', 'wilcoxon', 'median_test',
           'circmean', 'circvar', 'circstd', 'anderson_ksamp',
           'yeojohnson_llf', 'yeojohnson', 'yeojohnson_normmax',
           'yeojohnson_normplot'
           ]


Mean = namedtuple('Mean', ('statistic', 'minmax'))
Variance = namedtuple('Variance', ('statistic', 'minmax'))
Std_dev = namedtuple('Std_dev', ('statistic', 'minmax'))


def bayes_mvs(data, alpha=0.90):
    r"""
    Bayesian confidence intervals for the mean, var, and std.

    Parameters
    ----------
    data : array_like
        Input data; if multi-dimensional, it is flattened to 1-D by `bayes_mvs`.
        Requires 2 or more data points.
    alpha : float, optional
        Probability that the returned confidence interval contains
        the true parameter.

    Returns
    -------
    mean_cntr, var_cntr, std_cntr : tuple
        The three results are for the mean, variance and standard deviation,
        respectively.  Each result is a tuple of the form::

            (center, (lower, upper))

        with `center` the mean of the conditional pdf of the value given the
        data, and `(lower, upper)` a confidence interval, centered on the
        median, containing the estimate to a probability ``alpha``.

    See Also
    --------
    mvsdist

    Notes
    -----
    Each tuple of mean, variance, and standard deviation estimates represents
    ``(center, (lower, upper))``, with ``center`` the mean of the conditional
    pdf of the value given the data and ``(lower, upper)`` a confidence
    interval centered on the median, containing the estimate to a probability
    ``alpha``.

    Converts data to 1-D and assumes all data has the same mean and variance.
    Uses the Jeffreys prior for variance and std.

    Equivalent to ``tuple((x.mean(), x.interval(alpha)) for x in mvsdist(data))``.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    First a basic example to demonstrate the outputs:

    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.bayes_mvs(data)
    >>> mean
    Mean(statistic=9.0, minmax=(7.103650222612533, 10.896349777387467))
    >>> var
    Variance(statistic=10.0, minmax=(3.176724206..., 24.45910382...))
    >>> std
    Std_dev(statistic=2.9724954732045084, minmax=(1.7823367265645143, 4.945614605014631))

    Now we generate some normally distributed random data, and get estimates of
    mean and standard deviation with 95% confidence intervals for those
    estimates:

    >>> n_samples = 100000
    >>> data = stats.norm.rvs(size=n_samples)
    >>> res_mean, res_var, res_std = stats.bayes_mvs(data, alpha=0.95)

    >>> import matplotlib.pyplot as plt
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.hist(data, bins=100, density=True, label='Histogram of data')
    >>> ax.vlines(res_mean.statistic, 0, 0.5, colors='r', label='Estimated mean')
    >>> ax.axvspan(res_mean.minmax[0], res_mean.minmax[1], facecolor='r',
    ...            alpha=0.2, label=r'Estimated mean (95% limits)')
    >>> ax.vlines(res_std.statistic, 0, 0.5, colors='g', label='Estimated scale')
    >>> ax.axvspan(res_std.minmax[0], res_std.minmax[1], facecolor='g', alpha=0.2,
    ...            label=r'Estimated scale (95% limits)')

    >>> ax.legend(fontsize=10)
    >>> ax.set_xlim([-4, 4])
    >>> ax.set_ylim([0, 0.5])
    >>> plt.show()

    """
    if alpha >= 1 or alpha <= 0:
        raise ValueError("0 < alpha < 1 is required, but alpha=%s was given."
                         % alpha)

    m, v, s = mvsdist(data)
    m_res = Mean(m.mean(), m.interval(alpha))
    v_res = Variance(v.mean(), v.interval(alpha))
    s_res = Std_dev(s.mean(), s.interval(alpha))

    return m_res, v_res, s_res


def mvsdist(data):
    """
    'Frozen' distributions for mean, variance, and standard deviation of data.

    Parameters
    ----------
    data : array_like
        Input array. Converted to 1-D using ravel.
        Requires 2 or more data-points.

    Returns
    -------
    mdist : "frozen" distribution object
        Distribution object representing the mean of the data.
    vdist : "frozen" distribution object
        Distribution object representing the variance of the data.
    sdist : "frozen" distribution object
        Distribution object representing the standard deviation of the data.

    See Also
    --------
    bayes_mvs

    Notes
    -----
    The return values from ``bayes_mvs(data)`` are equivalent to
    ``tuple((x.mean(), x.interval(0.90)) for x in mvsdist(data))``.

    In other words, calling ``<dist>.mean()`` and ``<dist>.interval(0.90)``
    on the three distribution objects returned from this function will give
    the same results that are returned from `bayes_mvs`.

    References
    ----------
    T.E. Oliphant, "A Bayesian perspective on estimating mean, variance, and
    standard-deviation from data", https://scholarsarchive.byu.edu/facpub/278,
    2006.

    Examples
    --------
    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> mean, var, std = stats.mvsdist(data)

    We now have frozen distribution objects "mean", "var" and "std" that we can
    examine:

    >>> mean.mean()
    9.0
    >>> mean.interval(0.95)
    (6.6120585482655692, 11.387941451734431)
    >>> mean.std()
    1.1952286093343936

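    The equivalence with `bayes_mvs` described in the Notes can be verified
    directly:

    >>> np.allclose(mean.interval(0.90), stats.bayes_mvs(data)[0].minmax)
    True
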
192    """
193    x = ravel(data)
194    n = len(x)
195    if n < 2:
196        raise ValueError("Need at least 2 data-points.")
197    xbar = x.mean()
198    C = x.var()
199    if n > 1000:  # gaussian approximations for large n
200        mdist = distributions.norm(loc=xbar, scale=math.sqrt(C / n))
201        sdist = distributions.norm(loc=math.sqrt(C), scale=math.sqrt(C / (2. * n)))
202        vdist = distributions.norm(loc=C, scale=math.sqrt(2.0 / n) * C)
203    else:
204        nm1 = n - 1
205        fac = n * C / 2.
206        val = nm1 / 2.
207        mdist = distributions.t(nm1, loc=xbar, scale=math.sqrt(C / nm1))
208        sdist = distributions.gengamma(val, -2, scale=math.sqrt(fac))
209        vdist = distributions.invgamma(val, scale=fac)
210    return mdist, vdist, sdist
211
212
213def kstat(data, n=2):
214    r"""
215    Return the nth k-statistic (1<=n<=4 so far).
216
217    The nth k-statistic k_n is the unique symmetric unbiased estimator of the
218    nth cumulant kappa_n.
219
220    Parameters
221    ----------
222    data : array_like
223        Input array. Note that n-D input gets flattened.
224    n : int, {1, 2, 3, 4}, optional
225        Default is equal to 2.
226
227    Returns
228    -------
229    kstat : float
230        The nth k-statistic.
231
232    See Also
233    --------
234    kstatvar: Returns an unbiased estimator of the variance of the k-statistic.
235    moment: Returns the n-th central moment about the mean for a sample.
236
237    Notes
238    -----
239    For a sample size n, the first few k-statistics are given by:
240
241    .. math::
242
243        k_{1} = \mu
244        k_{2} = \frac{n}{n-1} m_{2}
245        k_{3} = \frac{ n^{2} } {(n-1) (n-2)} m_{3}
246        k_{4} = \frac{ n^{2} [(n + 1)m_{4} - 3(n - 1) m^2_{2}]} {(n-1) (n-2) (n-3)}
247
248    where :math:`\mu` is the sample mean, :math:`m_2` is the sample
249    variance, and :math:`m_i` is the i-th sample central moment.
250
251    References
252    ----------
253    http://mathworld.wolfram.com/k-Statistic.html
254
255    http://mathworld.wolfram.com/Cumulant.html
256
257    Examples
258    --------
259    >>> from scipy import stats
260    >>> from numpy.random import default_rng
261    >>> rng = default_rng()
262
263    As sample size increases, n-th moment and n-th k-statistic converge to the
264    same number (although they aren't identical). In the case of the normal
265    distribution, they converge to zero.
266
267    >>> for n in [2, 3, 4, 5, 6, 7]:
268    ...     x = rng.normal(size=10**n)
269    ...     m, k = stats.moment(x, 3), stats.kstat(x, 3)
270    ...     print("%.3g %.3g %.3g" % (m, k, m-k))
271    -0.631 -0.651 0.0194  # random
272    0.0282 0.0283 -8.49e-05
273    -0.0454 -0.0454 1.36e-05
274    7.53e-05 7.53e-05 -2.26e-09
275    0.00166 0.00166 -4.99e-09
276    -2.88e-06 -2.88e-06 8.63e-13
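
    For the lowest orders, the k-statistics reduce to familiar quantities:
    ``kstat(data, 1)`` is the sample mean and ``kstat(data, 2)`` is the
    unbiased sample variance (a quick consistency check against NumPy):

    >>> data = [1.0, 3.0, 4.0, 8.0]
    >>> np.allclose(stats.kstat(data, 1), np.mean(data))
    True
    >>> np.allclose(stats.kstat(data, 2), np.var(data, ddof=1))
    True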
277    """
278    if n > 4 or n < 1:
279        raise ValueError("k-statistics only supported for 1<=n<=4")
280    n = int(n)
281    S = np.zeros(n + 1, np.float64)
282    data = ravel(data)
283    N = data.size
284
285    # raise ValueError on empty input
286    if N == 0:
287        raise ValueError("Data input must not be empty")
288
289    # on nan input, return nan without warning
290    if np.isnan(np.sum(data)):
291        return np.nan
292
293    for k in range(1, n + 1):
294        S[k] = np.sum(data**k, axis=0)
295    if n == 1:
296        return S[1] * 1.0/N
297    elif n == 2:
298        return (N*S[2] - S[1]**2.0) / (N*(N - 1.0))
299    elif n == 3:
300        return (2*S[1]**3 - 3*N*S[1]*S[2] + N*N*S[3]) / (N*(N - 1.0)*(N - 2.0))
301    elif n == 4:
302        return ((-6*S[1]**4 + 12*N*S[1]**2 * S[2] - 3*N*(N-1.0)*S[2]**2 -
303                 4*N*(N+1)*S[1]*S[3] + N*N*(N+1)*S[4]) /
304                 (N*(N-1.0)*(N-2.0)*(N-3.0)))
305    else:
306        raise ValueError("Should not be here.")
307
308
309def kstatvar(data, n=2):
310    r"""Return an unbiased estimator of the variance of the k-statistic.
311
312    See `kstat` for more details of the k-statistic.
313
314    Parameters
315    ----------
316    data : array_like
317        Input array. Note that n-D input gets flattened.
318    n : int, {1, 2}, optional
319        Default is equal to 2.
320
321    Returns
322    -------
323    kstatvar : float
324        The nth k-statistic variance.
325
326    See Also
327    --------
328    kstat: Returns the n-th k-statistic.
329    moment: Returns the n-th central moment about the mean for a sample.
330
331    Notes
332    -----
333    The variances of the first few k-statistics are given by:
334
335    .. math::
336
337        var(k_{1}) = \frac{\kappa^2}{n}
338        var(k_{2}) = \frac{\kappa^4}{n} + \frac{2\kappa^2_{2}}{n - 1}
339        var(k_{3}) = \frac{\kappa^6}{n} + \frac{9 \kappa_2 \kappa_4}{n - 1} +
340                     \frac{9 \kappa^2_{3}}{n - 1} +
341                     \frac{6 n \kappa^3_{2}}{(n-1) (n-2)}
342        var(k_{4}) = \frac{\kappa^8}{n} + \frac{16 \kappa_2 \kappa_6}{n - 1} +
343                     \frac{48 \kappa_{3} \kappa_5}{n - 1} +
344                     \frac{34 \kappa^2_{4}}{n-1} + \frac{72 n \kappa^2_{2} \kappa_4}{(n - 1) (n - 2)} +
345                     \frac{144 n \kappa_{2} \kappa^2_{3}}{(n - 1) (n - 2)} +
346                     \frac{24 (n + 1) n \kappa^4_{2}}{(n - 1) (n - 2) (n - 3)}
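
    Examples
    --------
    The ``n=1`` case is the squared standard error of the mean, estimated
    here as ``k_2 / N`` (a quick consistency check against `kstat`):

    >>> from scipy import stats
    >>> data = [6, 9, 12, 7, 8, 8, 13]
    >>> np.allclose(stats.kstatvar(data, n=1), stats.kstat(data, n=2) / len(data))
    True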
347    """
348    data = ravel(data)
349    N = len(data)
350    if n == 1:
351        return kstat(data, n=2) * 1.0/N
352    elif n == 2:
353        k2 = kstat(data, n=2)
354        k4 = kstat(data, n=4)
355        return (2*N*k2**2 + (N-1)*k4) / (N*(N+1))
356    else:
357        raise ValueError("Only n=1 or n=2 supported.")
358
359
360def _calc_uniform_order_statistic_medians(n):
361    """Approximations of uniform order statistic medians.
362
363    Parameters
364    ----------
365    n : int
366        Sample size.
367
368    Returns
369    -------
370    v : 1d float array
371        Approximations of the order statistic medians.
372
373    References
374    ----------
375    .. [1] James J. Filliben, "The Probability Plot Correlation Coefficient
376           Test for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
377
378    Examples
379    --------
380    Order statistics of the uniform distribution on the unit interval
381    are marginally distributed according to beta distributions.
    The expectations of these order statistics are evenly spaced across
    the interval, but the distributions are skewed in a way that
    pushes the medians slightly towards the endpoints of the unit interval:

    >>> n = 4
    >>> k = np.arange(1, n+1)
    >>> from scipy.stats import beta
    >>> a = k
    >>> b = n-k+1
    >>> beta.mean(a, b)
    array([0.2, 0.4, 0.6, 0.8])
    >>> beta.median(a, b)
    array([0.15910358, 0.38572757, 0.61427243, 0.84089642])

    The Filliben approximation uses the exact medians of the smallest
    and greatest order statistics, and the remaining medians are approximated
    by points spread evenly across a sub-interval of the unit interval:

    >>> from scipy.stats.morestats import _calc_uniform_order_statistic_medians
    >>> _calc_uniform_order_statistic_medians(n)
    array([0.15910358, 0.38545246, 0.61454754, 0.84089642])

    This plot shows the skewed distributions of the order statistics
    of a sample of size four from a uniform distribution on the unit interval:

    >>> import matplotlib.pyplot as plt
    >>> x = np.linspace(0.0, 1.0, num=50, endpoint=True)
    >>> pdfs = [beta.pdf(x, a[i], b[i]) for i in range(n)]
    >>> plt.figure()
    >>> plt.plot(x, pdfs[0], x, pdfs[1], x, pdfs[2], x, pdfs[3])

    """
    v = np.empty(n, dtype=np.float64)
    v[-1] = 0.5**(1.0 / n)
    v[0] = 1 - v[-1]
    i = np.arange(2, n)
    v[1:-1] = (i - 0.3175) / (n + 0.365)
    return v


def _parse_dist_kw(dist, enforce_subclass=True):
    """Parse `dist` keyword.

    Parameters
    ----------
    dist : str or stats.distributions instance.
        Several functions take `dist` as a keyword, hence this utility
        function.
    enforce_subclass : bool, optional
        If True (default), `dist` needs to be a
        `_distn_infrastructure.rv_generic` instance.
        It can sometimes be useful to set this keyword to False, if a function
        wants to accept objects that just look somewhat like such an instance
        (for example, they have a ``ppf`` method).

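    Examples
    --------
    A minimal sketch of the two accepted forms, a distribution name or a
    distribution object (the string is looked up in `scipy.stats.distributions`):

    >>> from scipy.stats.morestats import _parse_dist_kw
    >>> from scipy import stats
    >>> _parse_dist_kw('norm') is stats.norm
    True
    >>> _parse_dist_kw(stats.norm) is stats.norm
    True
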
437    """
438    if isinstance(dist, rv_generic):
439        pass
440    elif isinstance(dist, str):
441        try:
442            dist = getattr(distributions, dist)
443        except AttributeError as e:
444            raise ValueError("%s is not a valid distribution name" % dist) from e
445    elif enforce_subclass:
446        msg = ("`dist` should be a stats.distributions instance or a string "
447               "with the name of such a distribution.")
448        raise ValueError(msg)
449
450    return dist
451
452
453def _add_axis_labels_title(plot, xlabel, ylabel, title):
454    """Helper function to add axes labels and a title to stats plots."""
455    try:
456        if hasattr(plot, 'set_title'):
457            # Matplotlib Axes instance or something that looks like it
458            plot.set_title(title)
459            plot.set_xlabel(xlabel)
460            plot.set_ylabel(ylabel)
461        else:
462            # matplotlib.pyplot module
463            plot.title(title)
464            plot.xlabel(xlabel)
465            plot.ylabel(ylabel)
466    except Exception:
467        # Not an MPL object or something that looks (enough) like it.
468        # Don't crash on adding labels or title
469        pass
470
471
472def probplot(x, sparams=(), dist='norm', fit=True, plot=None, rvalue=False):
473    """
474    Calculate quantiles for a probability plot, and optionally show the plot.
475
476    Generates a probability plot of sample data against the quantiles of a
477    specified theoretical distribution (the normal distribution by default).
478    `probplot` optionally calculates a best-fit line for the data and plots the
479    results using Matplotlib or a given plot function.
480
481    Parameters
482    ----------
483    x : array_like
484        Sample/response data from which `probplot` creates the plot.
485    sparams : tuple, optional
486        Distribution-specific shape parameters (shape parameters plus location
487        and scale).
488    dist : str or stats.distributions instance, optional
489        Distribution or distribution function name. The default is 'norm' for a
490        normal probability plot.  Objects that look enough like a
491        stats.distributions instance (i.e. they have a ``ppf`` method) are also
492        accepted.
493    fit : bool, optional
494        Fit a least-squares regression (best-fit) line to the sample data if
495        True (default).
496    plot : object, optional
497        If given, plots the quantiles.
498        If given and `fit` is True, also plots the least squares fit.
499        `plot` is an object that has to have methods "plot" and "text".
500        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
501        or a custom object with the same methods.
502        Default is None, which means that no plot is created.
503
504    Returns
505    -------
506    (osm, osr) : tuple of ndarrays
507        Tuple of theoretical quantiles (osm, or order statistic medians) and
508        ordered responses (osr).  `osr` is simply sorted input `x`.
509        For details on how `osm` is calculated see the Notes section.
510    (slope, intercept, r) : tuple of floats, optional
511        Tuple  containing the result of the least-squares fit, if that is
512        performed by `probplot`. `r` is the square root of the coefficient of
513        determination.  If ``fit=False`` and ``plot=None``, this tuple is not
514        returned.
515
516    Notes
517    -----
518    Even if `plot` is given, the figure is not shown or saved by `probplot`;
519    ``plt.show()`` or ``plt.savefig('figname.png')`` should be used after
520    calling `probplot`.
521
522    `probplot` generates a probability plot, which should not be confused with
523    a Q-Q or a P-P plot.  Statsmodels has more extensive functionality of this
524    type, see ``statsmodels.api.ProbPlot``.
525
526    The formula used for the theoretical quantiles (horizontal axis of the
527    probability plot) is Filliben's estimate::
528
529        quantiles = dist.ppf(val), for
530
531                0.5**(1/n),                  for i = n
532          val = (i - 0.3175) / (n + 0.365),  for i = 2, ..., n-1
533                1 - 0.5**(1/n),              for i = 1
534
535    where ``i`` indicates the i-th ordered value and ``n`` is the total number
536    of values.
537
538    Examples
539    --------
540    >>> from scipy import stats
541    >>> import matplotlib.pyplot as plt
542    >>> nsample = 100
543    >>> rng = np.random.default_rng()
544
545    A t distribution with small degrees of freedom:
546
547    >>> ax1 = plt.subplot(221)
548    >>> x = stats.t.rvs(3, size=nsample, random_state=rng)
549    >>> res = stats.probplot(x, plot=plt)
550
551    A t distribution with larger degrees of freedom:
552
553    >>> ax2 = plt.subplot(222)
554    >>> x = stats.t.rvs(25, size=nsample, random_state=rng)
555    >>> res = stats.probplot(x, plot=plt)
556
557    A mixture of two normal distributions with broadcasting:
558
559    >>> ax3 = plt.subplot(223)
560    >>> x = stats.norm.rvs(loc=[0,5], scale=[1,1.5],
561    ...                    size=(nsample//2,2), random_state=rng).ravel()
562    >>> res = stats.probplot(x, plot=plt)
563
564    A standard normal distribution:
565
566    >>> ax4 = plt.subplot(224)
567    >>> x = stats.norm.rvs(loc=0, scale=1, size=nsample, random_state=rng)
568    >>> res = stats.probplot(x, plot=plt)
569
570    Produce a new figure with a loggamma distribution, using the ``dist`` and
571    ``sparams`` keywords:
572
573    >>> fig = plt.figure()
574    >>> ax = fig.add_subplot(111)
575    >>> x = stats.loggamma.rvs(c=2.5, size=500, random_state=rng)
576    >>> res = stats.probplot(x, dist=stats.loggamma, sparams=(2.5,), plot=ax)
577    >>> ax.set_title("Probplot for loggamma dist with shape parameter 2.5")
578
579    Show the results with Matplotlib:
580
581    >>> plt.show()
582
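    The theoretical quantiles can also be computed without fitting or
    plotting; in that case the second element of the result is simply the
    sorted input (a quick check):

    >>> osm, osr = stats.probplot(x, fit=False)
    >>> np.allclose(osr, np.sort(x))
    True
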
583    """
584    x = np.asarray(x)
585    if x.size == 0:
586        if fit:
587            return (x, x), (np.nan, np.nan, 0.0)
588        else:
589            return x, x
590
591    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
592    dist = _parse_dist_kw(dist, enforce_subclass=False)
593    if sparams is None:
594        sparams = ()
595    if isscalar(sparams):
596        sparams = (sparams,)
597    if not isinstance(sparams, tuple):
598        sparams = tuple(sparams)
599
600    osm = dist.ppf(osm_uniform, *sparams)
601    osr = sort(x)
602    if fit:
603        # perform a linear least squares fit.
604        slope, intercept, r, prob, _ = stats.linregress(osm, osr)
605
606    if plot is not None:
607        plot.plot(osm, osr, 'bo')
608        if fit:
609            plot.plot(osm, slope*osm + intercept, 'r-')
610        _add_axis_labels_title(plot, xlabel='Theoretical quantiles',
611                               ylabel='Ordered Values',
612                               title='Probability Plot')
613
614        # Add R^2 value to the plot as text
615        if rvalue:
616            xmin = amin(osm)
617            xmax = amax(osm)
618            ymin = amin(x)
619            ymax = amax(x)
620            posx = xmin + 0.70 * (xmax - xmin)
621            posy = ymin + 0.01 * (ymax - ymin)
622            plot.text(posx, posy, "$R^2=%1.4f$" % r**2)
623
624    if fit:
625        return (osm, osr), (slope, intercept, r)
626    else:
627        return osm, osr
628
629
630def ppcc_max(x, brack=(0.0, 1.0), dist='tukeylambda'):
631    """Calculate the shape parameter that maximizes the PPCC.
632
633    The probability plot correlation coefficient (PPCC) plot can be used
634    to determine the optimal shape parameter for a one-parameter family
635    of distributions. ``ppcc_max`` returns the shape parameter that would
636    maximize the probability plot correlation coefficient for the given
637    data to a one-parameter family of distributions.
638
639    Parameters
640    ----------
641    x : array_like
642        Input array.
643    brack : tuple, optional
644        Triple (a,b,c) where (a<b<c). If bracket consists of two numbers (a, c)
645        then they are assumed to be a starting interval for a downhill bracket
646        search (see `scipy.optimize.brent`).
647    dist : str or stats.distributions instance, optional
648        Distribution or distribution function name.  Objects that look enough
649        like a stats.distributions instance (i.e. they have a ``ppf`` method)
650        are also accepted.  The default is ``'tukeylambda'``.
651
652    Returns
653    -------
654    shape_value : float
655        The shape parameter at which the probability plot correlation
656        coefficient reaches its max value.
657
658    See Also
659    --------
660    ppcc_plot, probplot, boxcox
661
662    Notes
663    -----
664    The brack keyword serves as a starting point which is useful in corner
665    cases. One can use a plot to obtain a rough visual estimate of the location
666    for the maximum to start the search near it.
667
668    References
669    ----------
670    .. [1] J.J. Filliben, "The Probability Plot Correlation Coefficient Test
671           for Normality", Technometrics, Vol. 17, pp. 111-117, 1975.
    .. [2] Engineering Statistics Handbook, NIST/SEMATECH,
           https://www.itl.nist.gov/div898/handbook/eda/section3/ppccplot.htm

    Examples
    --------
    First we generate some random data from a Weibull distribution
    with shape parameter 2.5:

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> rng = np.random.default_rng()
    >>> c = 2.5
    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)

    Generate the PPCC plot for this data with the Weibull distribution.

    >>> fig, ax = plt.subplots(figsize=(8, 6))
    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax)

    We calculate the shape value at which the PPCC should reach its maximum
    and draw a red vertical line there.  The line should coincide with the
    highest point in the PPCC graph.

    >>> cmax = stats.ppcc_max(x, brack=(c/2, 2*c), dist='weibull_min')
    >>> ax.axvline(cmax, color='r')
    >>> plt.show()

    """
    dist = _parse_dist_kw(dist)
    osm_uniform = _calc_uniform_order_statistic_medians(len(x))
    osr = sort(x)

    # this function computes the x-axis values of the probability plot
    #  and computes a linear regression (including the correlation)
    #  and returns 1-r so that a minimization function maximizes the
    #  correlation
    def tempfunc(shape, mi, yvals, func):
        xvals = func(mi, shape)
        r, prob = stats.pearsonr(xvals, yvals)
        return 1 - r

    return optimize.brent(tempfunc, brack=brack,
                          args=(osm_uniform, osr, dist.ppf))


def ppcc_plot(x, a, b, dist='tukeylambda', plot=None, N=80):
    """Calculate and optionally plot probability plot correlation coefficient.

    The probability plot correlation coefficient (PPCC) plot can be used to
    determine the optimal shape parameter for a one-parameter family of
    distributions.  It cannot be used for distributions without shape
    parameters (like the normal distribution) or with multiple shape
    parameters.

    By default a Tukey-Lambda distribution (`stats.tukeylambda`) is used. A
    Tukey-Lambda PPCC plot interpolates from long-tailed to short-tailed
    distributions via an approximately normal one, and is therefore
    particularly useful in practice.

    Parameters
    ----------
    x : array_like
        Input array.
    a, b : scalar
        Lower and upper bounds of the shape parameter to use.
    dist : str or stats.distributions instance, optional
        Distribution or distribution function name.  Objects that look enough
        like a stats.distributions instance (i.e. they have a ``ppf`` method)
        are also accepted.  The default is ``'tukeylambda'``.
    plot : object, optional
        If given, plots PPCC against the shape parameter.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `a` to `b`).

    Returns
    -------
    svals : ndarray
        The shape values for which `ppcc` was calculated.
    ppcc : ndarray
        The calculated probability plot correlation coefficient values.

    See Also
    --------
    ppcc_max, probplot, boxcox_normplot, tukeylambda

    References
    ----------
    J.J. Filliben, "The Probability Plot Correlation Coefficient Test for
    Normality", Technometrics, Vol. 17, pp. 111-117, 1975.

    Examples
    --------
    First we generate some random data from a Weibull distribution
    with shape parameter 2.5:

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> rng = np.random.default_rng()
    >>> c = 2.5
    >>> x = stats.weibull_min.rvs(c, scale=4, size=2000, random_state=rng)

    Take a look at the histogram of the data.

    >>> fig1, ax = plt.subplots(figsize=(9, 4))
    >>> ax.hist(x, bins=50)
    >>> ax.set_title('Histogram of x')
    >>> plt.show()

    Now we explore this data with a PPCC plot as well as the related
    probability plot and Box-Cox normplot.  A red line is drawn where we
    expect the PPCC value to be maximal (at the shape parameter ``c``
    used above):

    >>> fig2 = plt.figure(figsize=(12, 4))
    >>> ax1 = fig2.add_subplot(1, 3, 1)
    >>> ax2 = fig2.add_subplot(1, 3, 2)
    >>> ax3 = fig2.add_subplot(1, 3, 3)
    >>> res = stats.probplot(x, plot=ax1)
    >>> res = stats.boxcox_normplot(x, -4, 4, plot=ax2)
    >>> res = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', plot=ax3)
    >>> ax3.axvline(c, color='r')
    >>> plt.show()

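    Without a plot object, `ppcc_plot` simply returns the ``N`` evaluated
    shape values and the corresponding PPCC values (a quick check of the
    output shapes):

    >>> svals, ppcc = stats.ppcc_plot(x, c/2, 2*c, dist='weibull_min', N=40)
    >>> svals.shape, ppcc.shape
    ((40,), (40,))
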
800    """
801    if b <= a:
802        raise ValueError("`b` has to be larger than `a`.")
803
804    svals = np.linspace(a, b, num=N)
805    ppcc = np.empty_like(svals)
806    for k, sval in enumerate(svals):
807        _, r2 = probplot(x, sval, dist=dist, fit=True)
808        ppcc[k] = r2[-1]
809
810    if plot is not None:
811        plot.plot(svals, ppcc, 'x')
812        _add_axis_labels_title(plot, xlabel='Shape Values',
813                               ylabel='Prob Plot Corr. Coef.',
814                               title='(%s) PPCC Plot' % dist)
815
816    return svals, ppcc
817
818
819def boxcox_llf(lmb, data):
820    r"""The boxcox log-likelihood function.
821
822    Parameters
823    ----------
824    lmb : scalar
825        Parameter for Box-Cox transformation.  See `boxcox` for details.
826    data : array_like
827        Data to calculate Box-Cox log-likelihood for.  If `data` is
828        multi-dimensional, the log-likelihood is calculated along the first
829        axis.
830
831    Returns
832    -------
833    llf : float or ndarray
834        Box-Cox log-likelihood of `data` given `lmb`.  A float for 1-D `data`,
835        an array otherwise.
836
837    See Also
838    --------
839    boxcox, probplot, boxcox_normplot, boxcox_normmax
840
841    Notes
842    -----
843    The Box-Cox log-likelihood function is defined here as
844
845    .. math::
846
847        llf = (\lambda - 1) \sum_i(\log(x_i)) -
848              N/2 \log(\sum_i (y_i - \bar{y})^2 / N),
849
850    where ``y`` is the Box-Cox transformed input data ``x``.
851
852    Examples
853    --------
854    >>> from scipy import stats
855    >>> import matplotlib.pyplot as plt
856    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes
857
858    Generate some random variates and calculate Box-Cox log-likelihood values
859    for them for a range of ``lmbda`` values:
860
861    >>> rng = np.random.default_rng()
862    >>> x = stats.loggamma.rvs(5, loc=10, size=1000, random_state=rng)
863    >>> lmbdas = np.linspace(-2, 10)
864    >>> llf = np.zeros(lmbdas.shape, dtype=float)
865    >>> for ii, lmbda in enumerate(lmbdas):
866    ...     llf[ii] = stats.boxcox_llf(lmbda, x)
867
868    Also find the optimal lmbda value with `boxcox`:
869
870    >>> x_most_normal, lmbda_optimal = stats.boxcox(x)
871
    Plot the log-likelihood as a function of lmbda.  Add a horizontal line at
    the log-likelihood of the optimal lmbda to check that that's really the
    optimum:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(lmbdas, llf, 'b.-')
    >>> ax.axhline(stats.boxcox_llf(lmbda_optimal, x), color='r')
    >>> ax.set_xlabel('lmbda parameter')
    >>> ax.set_ylabel('Box-Cox log-likelihood')

    Now add some probability plots to show that where the log-likelihood is
    maximized the data transformed with `boxcox` looks closest to normal:

    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    ...     xt = stats.boxcox(x, lmbda=lmbda)
    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
    ...     ax_inset.set_xticklabels([])
    ...     ax_inset.set_yticklabels([])
    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

    >>> plt.show()

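    As a consistency check, the value returned by `boxcox_llf` matches a
    direct NumPy evaluation of the formula from the Notes section:

    >>> xc = np.array([1.0, 2.0, 3.0, 4.0])
    >>> lmb = 0.5
    >>> yc = stats.boxcox(xc, lmbda=lmb)
    >>> llf_direct = (lmb - 1) * np.sum(np.log(xc)) - len(xc)/2 * np.log(np.var(yc))
    >>> np.allclose(stats.boxcox_llf(lmb, xc), llf_direct)
    True
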
897    """
898    data = np.asarray(data)
899    N = data.shape[0]
900    if N == 0:
901        return np.nan
902
903    logdata = np.log(data)
904
905    # Compute the variance of the transformed data.
906    if lmb == 0:
907        variance = np.var(logdata, axis=0)
908    else:
        # Transform without the constant offset 1/lmb.  The offset does
        # not affect the variance, and the subtraction of the offset can
        # lead to loss of precision.
        variance = np.var(data**lmb / lmb, axis=0)

    return (lmb - 1) * np.sum(logdata, axis=0) - N/2 * np.log(variance)


def _boxcox_conf_interval(x, lmax, alpha):
    # Need to find the lambda for which
    #  f(x,lmbda) >= f(x,lmax) - 0.5*chi^2_alpha;1
    fac = 0.5 * distributions.chi2.ppf(1 - alpha, 1)
    target = boxcox_llf(lmax, x) - fac

    def rootfunc(lmbda, data, target):
        return boxcox_llf(lmbda, data) - target

    # Find positive endpoint of interval in which answer is to be found
    newlm = lmax + 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm += 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmplus = optimize.brentq(rootfunc, lmax, newlm, args=(x, target))

    # Now find negative interval in the same way
    newlm = lmax - 0.5
    N = 0
    while (rootfunc(newlm, x, target) > 0.0) and (N < 500):
        newlm -= 0.1
        N += 1

    if N == 500:
        raise RuntimeError("Could not find endpoint.")

    lmminus = optimize.brentq(rootfunc, newlm, lmax, args=(x, target))
    return lmminus, lmplus


def boxcox(x, lmbda=None, alpha=None, optimizer=None):
    r"""Return a dataset transformed by a Box-Cox power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Must be positive, 1-dimensional, and not constant.
    lmbda : {None, scalar}, optional
        If `lmbda` is not None, do the transformation for that value.
        If `lmbda` is None, find the lambda that maximizes the log-likelihood
        function and return it as the second output argument.
    alpha : {None, float}, optional
        If ``alpha`` is not None, return the ``100 * (1-alpha)%`` confidence
        interval for `lmbda` as the third output argument.
        Must be between 0.0 and 1.0.
    optimizer : callable, optional
        If `lmbda` is None, `optimizer` is the scalar optimizer used to find
        the value of `lmbda` that minimizes the negative log-likelihood
        function. `optimizer` is a callable that accepts one argument:

        fun : callable
            The objective function, which evaluates the negative
            log-likelihood function at a provided value of `lmbda`,

        and returns an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal value of
        `lmbda` in an attribute `x`.

        See the example in `boxcox_normmax` or the documentation of
        `scipy.optimize.minimize_scalar` for more information.

        If `lmbda` is not None, `optimizer` is ignored.

    Returns
    -------
    boxcox : ndarray
        Box-Cox power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.
    (min_ci, max_ci) : tuple of float, optional
        If `lmbda` parameter is None and ``alpha`` is not None, this returned
        tuple of floats represents the minimum and maximum confidence limits
        given ``alpha``.

    See Also
    --------
    probplot, boxcox_normplot, boxcox_normmax, boxcox_llf

    Notes
    -----
    The Box-Cox transform is given by::

        y = (x**lmbda - 1) / lmbda,  for lmbda != 0
            log(x),                  for lmbda = 0

    `boxcox` requires the input data to be positive.  Sometimes a Box-Cox
    transformation provides a shift parameter to achieve this; `boxcox` does
    not.  Such a shift parameter is equivalent to adding a positive constant to
    `x` before calling `boxcox`.

    The confidence limits returned when ``alpha`` is provided give the interval
    where:

    .. math::

        llf(\hat{\lambda}) - llf(\lambda) < \frac{1}{2}\chi^2(1 - \alpha, 1),

    with ``llf`` the log-likelihood function and :math:`\chi^2` the chi-squared
    function.

    References
    ----------
    G.E.P. Box and D.R. Cox, "An Analysis of Transformations", Journal of the
    Royal Statistical Society B, 26, 211-252 (1964).

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `boxcox` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, _ = stats.boxcox(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Box-Cox transformation')

    >>> plt.show()

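    As a quick check of the transform definition in the Notes section,
    ``lmbda=0`` reduces to a plain log transform:

    >>> xp = np.array([0.5, 1.0, 2.0, 4.0])
    >>> np.allclose(stats.boxcox(xp, lmbda=0), np.log(xp))
    True
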
1052    """
1053    x = np.asarray(x)
1054    if x.ndim != 1:
1055        raise ValueError("Data must be 1-dimensional.")
1056
1057    if x.size == 0:
1058        return x
1059
1060    if np.all(x == x[0]):
1061        raise ValueError("Data must not be constant.")
1062
1063    if np.any(x <= 0):
1064        raise ValueError("Data must be positive.")
1065
1066    if lmbda is not None:  # single transformation
1067        return special.boxcox(x, lmbda)
1068
1069    # If lmbda=None, find the lmbda that maximizes the log-likelihood function.
1070    lmax = boxcox_normmax(x, method='mle', optimizer=optimizer)
1071    y = boxcox(x, lmax)
1072
1073    if alpha is None:
1074        return y, lmax
1075    else:
1076        # Find confidence interval
1077        interval = _boxcox_conf_interval(x, lmax, alpha)
1078        return y, lmax, interval
1079
1080
1081def boxcox_normmax(x, brack=None, method='pearsonr', optimizer=None):
1082    """Compute optimal Box-Cox transform parameter for input data.
1083
1084    Parameters
1085    ----------
1086    x : array_like
1087        Input array.
1088    brack : 2-tuple, optional, default (-2.0, 2.0)
1089         The starting interval for a downhill bracket search for the default
1090         `optimize.brent` solver. Note that this is in most cases not
1091         critical; the final result is allowed to be outside this bracket.
1092         If `optimizer` is passed, `brack` must be None.
1093    method : str, optional
1094        The method to determine the optimal transform parameter (`boxcox`
1095        ``lmbda`` parameter). Options are:
1096
1097        'pearsonr'  (default)
1098            Maximizes the Pearson correlation coefficient between
1099            ``y = boxcox(x)`` and the expected values for ``y`` if `x` would be
1100            normally-distributed.
1101
1102        'mle'
            Maximizes the log-likelihood `boxcox_llf`.  This is the method
            used in `boxcox`.

        'all'
            Use all optimization methods available, and return all results.
            Useful to compare different methods.
    optimizer : callable, optional
        `optimizer` is a callable that accepts one argument:

        fun : callable
            The objective function to be optimized. `fun` accepts one argument,
            the Box-Cox transform parameter `lmbda`, and returns the negative
            log-likelihood function at the provided value. The job of `optimizer`
            is to find the value of `lmbda` that minimizes `fun`.

        and returns an object, such as an instance of
        `scipy.optimize.OptimizeResult`, which holds the optimal value of
        `lmbda` in an attribute `x`.

        See the example below or the documentation of
        `scipy.optimize.minimize_scalar` for more information.

    Returns
    -------
    maxlog : float or ndarray
        The optimal transform parameter found.  An array instead of a scalar
        for ``method='all'``.

    See Also
    --------
    boxcox, boxcox_llf, boxcox_normplot, scipy.optimize.minimize_scalar

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We can generate some data and determine the optimal ``lmbda`` in various
    ways:

    >>> rng = np.random.default_rng()
    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
    >>> y, lmax_mle = stats.boxcox(x)
    >>> lmax_pearsonr = stats.boxcox_normmax(x)

    >>> lmax_mle
    1.4613865614008015
    >>> lmax_pearsonr
    1.6685004886804342
    >>> stats.boxcox_normmax(x, method='all')
    array([1.66850049, 1.46138656])

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -10, 10, plot=ax)
    >>> ax.axvline(lmax_mle, color='r')
    >>> ax.axvline(lmax_pearsonr, color='g', ls='--')

    >>> plt.show()

    Alternatively, we can define our own `optimizer` function. Suppose we
    are only interested in values of `lmbda` on the interval [6, 7], want
    to use `scipy.optimize.minimize_scalar` with ``method='bounded'``, and
    want tighter tolerances when optimizing the log-likelihood function. To
    do this, we define a function that accepts the positional argument `fun`
    and uses `scipy.optimize.minimize_scalar` to minimize `fun` subject to
    the provided bounds and tolerances:

    >>> from scipy import optimize
    >>> options = {'xatol': 1e-12}  # absolute tolerance on `x`
    >>> def optimizer(fun):
    ...     return optimize.minimize_scalar(fun, bounds=(6, 7),
    ...                                     method="bounded", options=options)
    >>> stats.boxcox_normmax(x, optimizer=optimizer)
    6.000...
    """
    # If optimizer is not given, define default 'brent' optimizer.
    if optimizer is None:

        # Set default value for `brack`.
        if brack is None:
            brack = (-2.0, 2.0)

        def _optimizer(func, args):
            return optimize.brent(func, args=args, brack=brack)

    # Otherwise check optimizer.
    else:
        if not callable(optimizer):
            raise ValueError("`optimizer` must be a callable")

        if brack is not None:
            raise ValueError("`brack` must be None if `optimizer` is given")

        # `optimizer` is expected to return an `OptimizeResult` object; here
        # we extract the solution to the optimization problem.
        def _optimizer(func, args):
            def func_wrapped(x):
                return func(x, *args)
            return getattr(optimizer(func_wrapped), 'x', None)

    def _pearsonr(x):
        osm_uniform = _calc_uniform_order_statistic_medians(len(x))
        xvals = distributions.norm.ppf(osm_uniform)

        def _eval_pearsonr(lmbda, xvals, samps):
            # Compute the correlation between the sorted, transformed data
            # and the normal order statistic medians; return ``1 - r`` so
            # that a minimization function maximizes the correlation.
            y = boxcox(samps, lmbda)
            yvals = np.sort(y)
            r, prob = stats.pearsonr(xvals, yvals)
            return 1 - r

        return _optimizer(_eval_pearsonr, args=(xvals, x))

    def _mle(x):
        def _eval_mle(lmb, data):
            # function to minimize
            return -boxcox_llf(lmb, data)

        return _optimizer(_eval_mle, args=(x,))

    def _all(x):
        maxlog = np.empty(2, dtype=float)
        maxlog[0] = _pearsonr(x)
        maxlog[1] = _mle(x)
        return maxlog

    methods = {'pearsonr': _pearsonr,
               'mle': _mle,
               'all': _all}
    if method not in methods.keys():
        raise ValueError("Method %s not recognized." % method)

    optimfunc = methods[method]
    res = optimfunc(x)
    if res is None:
        message = ("`optimizer` must return an object containing the optimal "
                   "`lmbda` in attribute `x`")
        raise ValueError(message)
    return res


def _normplot(method, x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox or Yeo-Johnson normality plot,
    optionally show it.

    See `boxcox_normplot` or `yeojohnson_normplot` for details.
    """

    if method == 'boxcox':
        title = 'Box-Cox Normality Plot'
        transform_func = boxcox
    else:
        title = 'Yeo-Johnson Normality Plot'
        transform_func = yeojohnson

    x = np.asarray(x)
    if x.size == 0:
        return x

    if lb <= la:
        raise ValueError("`lb` has to be larger than `la`.")

    lmbdas = np.linspace(la, lb, num=N)
    ppcc = lmbdas * 0.0
    for i, val in enumerate(lmbdas):
        # Determine for each lmbda the correlation coefficient of the
        # probability plot of the transformed x
        z = transform_func(x, lmbda=val)
        _, (_, _, r) = probplot(z, dist='norm', fit=True)
        ppcc[i] = r

    if plot is not None:
        plot.plot(lmbdas, ppcc, 'x')
        _add_axis_labels_title(plot, xlabel='$\\lambda$',
                               ylabel='Prob Plot Corr. Coef.',
                               title=title)

    return lmbdas, ppcc


def boxcox_normplot(x, la, lb, plot=None, N=80):
    """Compute parameters for a Box-Cox normality plot, optionally show it.

    A Box-Cox normality plot shows graphically what the best transformation
    parameter is to use in `boxcox` to obtain a distribution that is close
    to normal.

    Parameters
    ----------
    x : array_like
        Input array.
    la, lb : scalar
        The lower and upper bounds for the ``lmbda`` values to pass to `boxcox`
        for Box-Cox transformations.  These are also the limits of the
        horizontal axis of the plot if that is generated.
    plot : object, optional
        If given, plots the quantiles and least squares fit.
        `plot` is an object that has to have methods "plot" and "text".
        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
        or a custom object with the same methods.
        Default is None, which means that no plot is created.
    N : int, optional
        Number of points on the horizontal axis (equally distributed from
        `la` to `lb`).

    Returns
    -------
    lmbdas : ndarray
        The ``lmbda`` values for which a Box-Cox transform was done.
    ppcc : ndarray
        Probability Plot Correlation Coefficient, as obtained from `probplot`
        when fitting the Box-Cox transformed input `x` against a normal
        distribution.

    See Also
    --------
    probplot, boxcox, boxcox_normmax, boxcox_llf, ppcc_max

    Notes
    -----
    Even if `plot` is given, the figure is not shown or saved by
    `boxcox_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
    should be used after calling `boxcox_normplot`.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    Generate some non-normally distributed data, and create a Box-Cox plot:

    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> prob = stats.boxcox_normplot(x, -20, 20, plot=ax)

    Determine the optimal ``lmbda`` to transform ``x``, and plot it in the
    same plot:

    >>> _, maxlog = stats.boxcox(x)
    >>> ax.axvline(maxlog, color='r')

    >>> plt.show()

    """
    return _normplot('boxcox', x, la, lb, plot, N)


def yeojohnson(x, lmbda=None):
    r"""Return a dataset transformed by a Yeo-Johnson power transformation.

    Parameters
    ----------
    x : ndarray
        Input array.  Should be 1-dimensional.
    lmbda : float, optional
        If ``lmbda`` is ``None``, find the lambda that maximizes the
        log-likelihood function and return it as the second output argument.
        Otherwise the transformation is done for the given value.

    Returns
    -------
    yeojohnson: ndarray
        Yeo-Johnson power transformed array.
    maxlog : float, optional
        If the `lmbda` parameter is None, the second returned argument is
        the lambda that maximizes the log-likelihood function.

    See Also
    --------
    probplot, yeojohnson_normplot, yeojohnson_normmax, yeojohnson_llf, boxcox

    Notes
    -----
    The Yeo-Johnson transform is given by::

        y = ((x + 1)**lmbda - 1) / lmbda,                for x >= 0, lmbda != 0
            log(x + 1),                                  for x >= 0, lmbda = 0
            -((-x + 1)**(2 - lmbda) - 1) / (2 - lmbda),  for x < 0, lmbda != 2
            -log(-x + 1),                                for x < 0, lmbda = 2

    Unlike `boxcox`, `yeojohnson` does not require the input data to be
    positive.

    .. versionadded:: 1.2.0

    References
    ----------
    I. Yeo and R.A. Johnson, "A New Family of Power Transformations to
    Improve Normality or Symmetry", Biometrika 87.4 (2000): 954-959.

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    We generate some random variates from a non-normal distribution and make a
    probability plot for it, to show it is non-normal in the tails:

    >>> fig = plt.figure()
    >>> ax1 = fig.add_subplot(211)
    >>> x = stats.loggamma.rvs(5, size=500) + 5
    >>> prob = stats.probplot(x, dist=stats.norm, plot=ax1)
    >>> ax1.set_xlabel('')
    >>> ax1.set_title('Probplot against normal distribution')

    We now use `yeojohnson` to transform the data so it's closest to normal:

    >>> ax2 = fig.add_subplot(212)
    >>> xt, lmbda = stats.yeojohnson(x)
    >>> prob = stats.probplot(xt, dist=stats.norm, plot=ax2)
    >>> ax2.set_title('Probplot after Yeo-Johnson transformation')

    >>> plt.show()

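    As a quick check of the transform definition in the Notes section,
    ``lmbda=1`` leaves the data unchanged up to floating point error:

    >>> xy = np.array([-2.0, -0.5, 0.0, 1.5])
    >>> np.allclose(stats.yeojohnson(xy, lmbda=1), xy)
    True
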
1424    """
1425    x = np.asarray(x)
1426    if x.size == 0:
1427        return x
1428
1429    if np.issubdtype(x.dtype, np.complexfloating):
1430        raise ValueError('Yeo-Johnson transformation is not defined for '
1431                         'complex numbers.')
1432
1433    if np.issubdtype(x.dtype, np.integer):
1434        x = x.astype(np.float64, copy=False)
1435
1436    if lmbda is not None:
1437        return _yeojohnson_transform(x, lmbda)
1438
1439    # if lmbda=None, find the lmbda that maximizes the log-likelihood function.
1440    lmax = yeojohnson_normmax(x)
1441    y = _yeojohnson_transform(x, lmax)
1442
1443    return y, lmax
1444
1445
1446def _yeojohnson_transform(x, lmbda):
1447    """Returns `x` transformed by the Yeo-Johnson power transform with given
1448    parameter `lmbda`.
1449    """
1450    out = np.zeros_like(x)
1451    pos = x >= 0  # binary mask
1452
1453    # when x >= 0
1454    if abs(lmbda) < np.spacing(1.):
1455        out[pos] = np.log1p(x[pos])
1456    else:  # lmbda != 0
1457        out[pos] = (np.power(x[pos] + 1, lmbda) - 1) / lmbda
1458
1459    # when x < 0
1460    if abs(lmbda - 2) > np.spacing(1.):
1461        out[~pos] = -(np.power(-x[~pos] + 1, 2 - lmbda) - 1) / (2 - lmbda)
1462    else:  # lmbda == 2
1463        out[~pos] = -np.log1p(-x[~pos])
1464
1465    return out
1466
1467
1468def yeojohnson_llf(lmb, data):
1469    r"""The yeojohnson log-likelihood function.
1470
1471    Parameters
1472    ----------
1473    lmb : scalar
1474        Parameter for Yeo-Johnson transformation. See `yeojohnson` for
1475        details.
1476    data : array_like
1477        Data to calculate Yeo-Johnson log-likelihood for. If `data` is
1478        multi-dimensional, the log-likelihood is calculated along the first
1479        axis.
1480
1481    Returns
1482    -------
1483    llf : float
1484        Yeo-Johnson log-likelihood of `data` given `lmb`.
1485
1486    See Also
1487    --------
1488    yeojohnson, probplot, yeojohnson_normplot, yeojohnson_normmax
1489
1490    Notes
1491    -----
1492    The Yeo-Johnson log-likelihood function is defined here as
1493
1494    .. math::
1495
        llf = -N/2 \log(\hat{\sigma}^2) + (\lambda - 1)
              \sum_i \text{sign}(x_i) \log(|x_i| + 1)

    where :math:`\hat{\sigma}^2` is the estimated variance of the Yeo-Johnson
    transformed input data ``x``.

    .. versionadded:: 1.2.0

    Examples
    --------
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>> from mpl_toolkits.axes_grid1.inset_locator import inset_axes

    Generate some random variates and calculate Yeo-Johnson log-likelihood
    values for them for a range of ``lmbda`` values:

    >>> x = stats.loggamma.rvs(5, loc=10, size=1000)
    >>> lmbdas = np.linspace(-2, 10)
    >>> llf = np.zeros(lmbdas.shape, dtype=float)
    >>> for ii, lmbda in enumerate(lmbdas):
    ...     llf[ii] = stats.yeojohnson_llf(lmbda, x)

    Also find the optimal lmbda value with `yeojohnson`:

    >>> x_most_normal, lmbda_optimal = stats.yeojohnson(x)

    Plot the log-likelihood as a function of lmbda.  Add a horizontal line at
    the log-likelihood of the optimal lmbda to check that that's really the
    optimum:

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(lmbdas, llf, 'b.-')
    >>> ax.axhline(stats.yeojohnson_llf(lmbda_optimal, x), color='r')
    >>> ax.set_xlabel('lmbda parameter')
    >>> ax.set_ylabel('Yeo-Johnson log-likelihood')

    Now add some probability plots to show that where the log-likelihood is
    maximized the data transformed with `yeojohnson` looks closest to normal:

    >>> locs = [3, 10, 4]  # 'lower left', 'center', 'lower right'
    >>> for lmbda, loc in zip([-1, lmbda_optimal, 9], locs):
    ...     xt = stats.yeojohnson(x, lmbda=lmbda)
    ...     (osm, osr), (slope, intercept, r_sq) = stats.probplot(xt)
    ...     ax_inset = inset_axes(ax, width="20%", height="20%", loc=loc)
    ...     ax_inset.plot(osm, osr, 'c.', osm, slope*osm + intercept, 'k-')
    ...     ax_inset.set_xticklabels([])
    ...     ax_inset.set_yticklabels([])
    ...     ax_inset.set_title(r'$\lambda=%1.2f$' % lmbda)

    >>> plt.show()

1548    """
1549    data = np.asarray(data)
1550    n_samples = data.shape[0]
1551
1552    if n_samples == 0:
1553        return np.nan
1554
1555    trans = _yeojohnson_transform(data, lmb)
1556
1557    loglike = -n_samples / 2 * np.log(trans.var(axis=0))
1558    loglike += (lmb - 1) * (np.sign(data) * np.log(np.abs(data) + 1)).sum(axis=0)
1559
1560    return loglike
1561
1562
1563def yeojohnson_normmax(x, brack=(-2, 2)):
1564    """Compute optimal Yeo-Johnson transform parameter.
1565
1566    Compute optimal Yeo-Johnson transform parameter for input data, using
1567    maximum likelihood estimation.
1568
1569    Parameters
1570    ----------
1571    x : array_like
1572        Input array.
1573    brack : 2-tuple, optional
1574        The starting interval for a downhill bracket search with
1575        `optimize.brent`. Note that this is in most cases not critical; the
1576        final result is allowed to be outside this bracket.
1577
1578    Returns
1579    -------
1580    maxlog : float
1581        The optimal transform parameter found.
1582
1583    See Also
1584    --------
1585    yeojohnson, yeojohnson_llf, yeojohnson_normplot
1586
1587    Notes
1588    -----
1589    .. versionadded:: 1.2.0
1590
1591    Examples
1592    --------
1593    >>> from scipy import stats
1594    >>> import matplotlib.pyplot as plt
1595
1596    Generate some data and determine optimal ``lmbda``
1597
1598    >>> rng = np.random.default_rng()
1599    >>> x = stats.loggamma.rvs(5, size=30, random_state=rng) + 5
1600    >>> lmax = stats.yeojohnson_normmax(x)
1601
1602    >>> fig = plt.figure()
1603    >>> ax = fig.add_subplot(111)
1604    >>> prob = stats.yeojohnson_normplot(x, -10, 10, plot=ax)
1605    >>> ax.axvline(lmax, color='r')
1606
1607    >>> plt.show()
1608
1609    """
1610    def _neg_llf(lmbda, data):
1611        return -yeojohnson_llf(lmbda, data)
1612
1613    return optimize.brent(_neg_llf, brack=brack, args=(x,))
1614
1615
1616def yeojohnson_normplot(x, la, lb, plot=None, N=80):
1617    """Compute parameters for a Yeo-Johnson normality plot, optionally show it.
1618
1619    A Yeo-Johnson normality plot shows graphically what the best
1620    transformation parameter is to use in `yeojohnson` to obtain a
1621    distribution that is close to normal.
1622
1623    Parameters
1624    ----------
1625    x : array_like
1626        Input array.
1627    la, lb : scalar
1628        The lower and upper bounds for the ``lmbda`` values to pass to
1629        `yeojohnson` for Yeo-Johnson transformations. These are also the
1630        limits of the horizontal axis of the plot if that is generated.
1631    plot : object, optional
1632        If given, plots the quantiles and least squares fit.
1633        `plot` is an object that has to have methods "plot" and "text".
1634        The `matplotlib.pyplot` module or a Matplotlib Axes object can be used,
1635        or a custom object with the same methods.
1636        Default is None, which means that no plot is created.
1637    N : int, optional
1638        Number of points on the horizontal axis (equally distributed from
1639        `la` to `lb`).
1640
1641    Returns
1642    -------
1643    lmbdas : ndarray
1644        The ``lmbda`` values for which a Yeo-Johnson transform was done.
1645    ppcc : ndarray
        Probability Plot Correlation Coefficient, as obtained from `probplot`
        when fitting the Yeo-Johnson transformed input `x` against a normal
        distribution.
1649
1650    See Also
1651    --------
1652    probplot, yeojohnson, yeojohnson_normmax, yeojohnson_llf, ppcc_max
1653
1654    Notes
1655    -----
    Even if `plot` is given, the figure is not shown or saved by
    `yeojohnson_normplot`; ``plt.show()`` or ``plt.savefig('figname.png')``
    should be used after calling `yeojohnson_normplot`.
1659
1660    .. versionadded:: 1.2.0
1661
1662    Examples
1663    --------
1664    >>> from scipy import stats
1665    >>> import matplotlib.pyplot as plt
1666
1667    Generate some non-normally distributed data, and create a Yeo-Johnson plot:
1668
1669    >>> x = stats.loggamma.rvs(5, size=500) + 5
1670    >>> fig = plt.figure()
1671    >>> ax = fig.add_subplot(111)
1672    >>> prob = stats.yeojohnson_normplot(x, -20, 20, plot=ax)
1673
1674    Determine and plot the optimal ``lmbda`` to transform ``x`` and plot it in
1675    the same plot:
1676
1677    >>> _, maxlog = stats.yeojohnson(x)
1678    >>> ax.axvline(maxlog, color='r')
1679
1680    >>> plt.show()
1681
1682    """
1683    return _normplot('yeojohnson', x, la, lb, plot, N)
1684
1685
1686ShapiroResult = namedtuple('ShapiroResult', ('statistic', 'pvalue'))
1687
1688
1689def shapiro(x):
1690    """Perform the Shapiro-Wilk test for normality.
1691
1692    The Shapiro-Wilk test tests the null hypothesis that the
1693    data was drawn from a normal distribution.
1694
1695    Parameters
1696    ----------
1697    x : array_like
1698        Array of sample data.
1699
1700    Returns
1701    -------
1702    statistic : float
1703        The test statistic.
1704    p-value : float
1705        The p-value for the hypothesis test.
1706
1707    See Also
1708    --------
1709    anderson : The Anderson-Darling test for normality
1710    kstest : The Kolmogorov-Smirnov test for goodness of fit.
1711
1712    Notes
1713    -----
1714    The algorithm used is described in [4]_ but censoring parameters as
1715    described are not implemented. For N > 5000 the W test statistic is accurate
1716    but the p-value may not be.
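
    The test statistic is

    .. math:: W = \frac{\left(\sum_{i=1}^N a_i x_{(i)}\right)^2}
                       {\sum_{i=1}^N (x_i - \bar{x})^2},

    where the :math:`x_{(i)}` are the ordered sample values and the
    coefficients :math:`a_i` are derived from the order statistics of a
    standard normal sample, as tabulated by the routine of [4]_.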
1717
1718    The chance of rejecting the null hypothesis when it is true is close to 5%
1719    regardless of sample size.
1720
1721    References
1722    ----------
1723    .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1724    .. [2] Shapiro, S. S. & Wilk, M.B (1965). An analysis of variance test for
1725           normality (complete samples), Biometrika, Vol. 52, pp. 591-611.
1726    .. [3] Razali, N. M. & Wah, Y. B. (2011) Power comparisons of Shapiro-Wilk,
1727           Kolmogorov-Smirnov, Lilliefors and Anderson-Darling tests, Journal of
1728           Statistical Modeling and Analytics, Vol. 2, pp. 21-33.
1729    .. [4] ALGORITHM AS R94 APPL. STATIST. (1995) VOL. 44, NO. 4.
1730
1731    Examples
1732    --------
1733    >>> from scipy import stats
1734    >>> rng = np.random.default_rng()
1735    >>> x = stats.norm.rvs(loc=5, scale=3, size=100, random_state=rng)
1736    >>> shapiro_test = stats.shapiro(x)
1737    >>> shapiro_test
1738    ShapiroResult(statistic=0.9813305735588074, pvalue=0.16855233907699585)
1739    >>> shapiro_test.statistic
1740    0.9813305735588074
1741    >>> shapiro_test.pvalue
1742    0.16855233907699585
1743
1744    """
1745    x = np.ravel(x)
1746
1747    N = len(x)
1748    if N < 3:
1749        raise ValueError("Data must be at least length 3.")
1750
1751    a = zeros(N, 'f')
1752    init = 0
1753
1754    y = sort(x)
1755    a, w, pw, ifault = statlib.swilk(y, a[:N//2], init)
1756    if ifault not in [0, 2]:
1757        warnings.warn("Input data for shapiro has range zero. The results "
1758                      "may not be accurate.")
1759    if N > 5000:
1760        warnings.warn("p-value may not be accurate for N > 5000.")
1761
1762    return ShapiroResult(w, pw)
1763
1764
1765# Values from Stephens, M A, "EDF Statistics for Goodness of Fit and
1766#             Some Comparisons", Journal of the American Statistical
1767#             Association, Vol. 69, Issue 347, Sept. 1974, pp 730-737
1768_Avals_norm = array([0.576, 0.656, 0.787, 0.918, 1.092])
1769_Avals_expon = array([0.922, 1.078, 1.341, 1.606, 1.957])
1770# From Stephens, M A, "Goodness of Fit for the Extreme Value Distribution",
1771#             Biometrika, Vol. 64, Issue 3, Dec. 1977, pp 583-588.
1772_Avals_gumbel = array([0.474, 0.637, 0.757, 0.877, 1.038])
1773# From Stephens, M A, "Tests of Fit for the Logistic Distribution Based
1774#             on the Empirical Distribution Function.", Biometrika,
1775#             Vol. 66, Issue 3, Dec. 1979, pp 591-595.
1776_Avals_logistic = array([0.426, 0.563, 0.660, 0.769, 0.906, 1.010])
1777
1778
1779AndersonResult = namedtuple('AndersonResult', ('statistic',
1780                                               'critical_values',
1781                                               'significance_level'))
1782
1783
1784def anderson(x, dist='norm'):
1785    """Anderson-Darling test for data coming from a particular distribution.
1786
1787    The Anderson-Darling test tests the null hypothesis that a sample is
1788    drawn from a population that follows a particular distribution.
1789    For the Anderson-Darling test, the critical values depend on
1790    which distribution is being tested against.  This function works
1791    for normal, exponential, logistic, or Gumbel (Extreme Value
1792    Type I) distributions.
1793
1794    Parameters
1795    ----------
1796    x : array_like
1797        Array of sample data.
1798    dist : {'norm', 'expon', 'logistic', 'gumbel', 'gumbel_l', 'gumbel_r', 'extreme1'}, optional
1799        The type of distribution to test against.  The default is 'norm'.
1800        The names 'extreme1', 'gumbel_l' and 'gumbel' are synonyms for the
1801        same distribution.
1802
1803    Returns
1804    -------
1805    statistic : float
1806        The Anderson-Darling test statistic.
1807    critical_values : list
1808        The critical values for this distribution.
1809    significance_level : list
1810        The significance levels for the corresponding critical values
1811        in percents.  The function returns critical values for a
1812        differing set of significance levels depending on the
1813        distribution that is being tested against.
1814
1815    See Also
1816    --------
1817    kstest : The Kolmogorov-Smirnov test for goodness-of-fit.
1818
1819    Notes
1820    -----
1821    Critical values provided are for the following significance levels:
1822
1823    normal/exponential
1824        15%, 10%, 5%, 2.5%, 1%
1825    logistic
1826        25%, 10%, 5%, 2.5%, 1%, 0.5%
1827    Gumbel
1828        25%, 10%, 5%, 2.5%, 1%
1829
    If the returned statistic is larger than one of these critical values,
    then the null hypothesis that the data come from the chosen
    distribution can be rejected at the corresponding significance level.
1833    The returned statistic is referred to as 'A2' in the references.
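
    The statistic returned here is computed as

    .. math::

        A^2 = -N - \sum_{i=1}^N \frac{2i - 1}{N}
              \left[\ln F(Y_i) + \ln(1 - F(Y_{N+1-i}))\right],

    where :math:`Y_1 \leq \dots \leq Y_N` is the sorted, standardized sample
    and :math:`F` is the CDF of the hypothesized distribution with its
    parameters estimated from the data.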
1834
1835    References
1836    ----------
1837    .. [1] https://www.itl.nist.gov/div898/handbook/prc/section2/prc213.htm
1838    .. [2] Stephens, M. A. (1974). EDF Statistics for Goodness of Fit and
1839           Some Comparisons, Journal of the American Statistical Association,
1840           Vol. 69, pp. 730-737.
1841    .. [3] Stephens, M. A. (1976). Asymptotic Results for Goodness-of-Fit
1842           Statistics with Unknown Parameters, Annals of Statistics, Vol. 4,
1843           pp. 357-369.
1844    .. [4] Stephens, M. A. (1977). Goodness of Fit for the Extreme Value
1845           Distribution, Biometrika, Vol. 64, pp. 583-588.
1846    .. [5] Stephens, M. A. (1977). Goodness of Fit with Special Reference
           to Tests for Exponentiality, Technical Report No. 262,
1848           Department of Statistics, Stanford University, Stanford, CA.
1849    .. [6] Stephens, M. A. (1979). Tests of Fit for the Logistic Distribution
1850           Based on the Empirical Distribution Function, Biometrika, Vol. 66,
1851           pp. 591-595.
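
    Examples
    --------
    A minimal sketch: test a normal random sample against the normal family
    (the statistic varies with the random draw):

    >>> from scipy import stats
    >>> rng = np.random.default_rng()
    >>> x = rng.normal(size=100)
    >>> res = stats.anderson(x, dist='norm')
    >>> res.significance_level
    array([15. , 10. ,  5. ,  2.5,  1. ])

    If ``res.statistic`` exceeds ``res.critical_values[2]``, the null
    hypothesis of normality can be rejected at the corresponding 5% level.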
1852
1853    """
    if dist not in ['norm', 'expon', 'gumbel', 'gumbel_l',
                    'gumbel_r', 'extreme1', 'logistic']:
        raise ValueError("Invalid distribution; dist must be 'norm', "
                         "'expon', 'gumbel', 'gumbel_l', 'gumbel_r', "
                         "'extreme1' or 'logistic'.")
1858    y = sort(x)
1859    xbar = np.mean(x, axis=0)
1860    N = len(y)
1861    if dist == 'norm':
1862        s = np.std(x, ddof=1, axis=0)
1863        w = (y - xbar) / s
1864        logcdf = distributions.norm.logcdf(w)
1865        logsf = distributions.norm.logsf(w)
1866        sig = array([15, 10, 5, 2.5, 1])
1867        critical = around(_Avals_norm / (1.0 + 4.0/N - 25.0/N/N), 3)
1868    elif dist == 'expon':
1869        w = y / xbar
1870        logcdf = distributions.expon.logcdf(w)
1871        logsf = distributions.expon.logsf(w)
1872        sig = array([15, 10, 5, 2.5, 1])
1873        critical = around(_Avals_expon / (1.0 + 0.6/N), 3)
1874    elif dist == 'logistic':
1875        def rootfunc(ab, xj, N):
1876            a, b = ab
1877            tmp = (xj - a) / b
1878            tmp2 = exp(tmp)
1879            val = [np.sum(1.0/(1+tmp2), axis=0) - 0.5*N,
1880                   np.sum(tmp*(1.0-tmp2)/(1+tmp2), axis=0) + N]
1881            return array(val)
1882
1883        sol0 = array([xbar, np.std(x, ddof=1, axis=0)])
1884        sol = optimize.fsolve(rootfunc, sol0, args=(x, N), xtol=1e-5)
1885        w = (y - sol[0]) / sol[1]
1886        logcdf = distributions.logistic.logcdf(w)
1887        logsf = distributions.logistic.logsf(w)
1888        sig = array([25, 10, 5, 2.5, 1, 0.5])
1889        critical = around(_Avals_logistic / (1.0 + 0.25/N), 3)
1890    elif dist == 'gumbel_r':
1891        xbar, s = distributions.gumbel_r.fit(x)
1892        w = (y - xbar) / s
1893        logcdf = distributions.gumbel_r.logcdf(w)
1894        logsf = distributions.gumbel_r.logsf(w)
1895        sig = array([25, 10, 5, 2.5, 1])
1896        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1897    else:  # (dist == 'gumbel') or (dist == 'gumbel_l') or (dist == 'extreme1')
1898        xbar, s = distributions.gumbel_l.fit(x)
1899        w = (y - xbar) / s
1900        logcdf = distributions.gumbel_l.logcdf(w)
1901        logsf = distributions.gumbel_l.logsf(w)
1902        sig = array([25, 10, 5, 2.5, 1])
1903        critical = around(_Avals_gumbel / (1.0 + 0.2/sqrt(N)), 3)
1904
1905    i = arange(1, N + 1)
1906    A2 = -N - np.sum((2*i - 1.0) / N * (logcdf + logsf[::-1]), axis=0)
1907
1908    return AndersonResult(A2, critical, sig)
1909
1910
1911def _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N):
1912    """Compute A2akN equation 7 of Scholz and Stephens.
1913
1914    Parameters
1915    ----------
1916    samples : sequence of 1-D array_like
1917        Array of sample arrays.
1918    Z : array_like
1919        Sorted array of all observations.
1920    Zstar : array_like
1921        Sorted array of unique observations.
1922    k : int
1923        Number of samples.
1924    n : array_like
1925        Number of observations in each sample.
1926    N : int
1927        Total number of observations.
1928
1929    Returns
1930    -------
1931    A2aKN : float
        The A2aKN statistic of Scholz and Stephens 1987.
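
    Notes
    -----
    In the notation of Scholz and Stephens, the quantity computed below is

    .. math::

        A^2_{akN} = \frac{N - 1}{N} \sum_{i=1}^k \frac{1}{n_i} \sum_j
            \frac{l_j}{N} \,
            \frac{(N M_{ij} - n_i B_j)^2}{B_j (N - B_j) - N l_j / 4},

    where :math:`l_j` is the multiplicity of :math:`Z^*_j` in the pooled
    sample, :math:`B_j` is its midrank, and :math:`M_{ij}` is the number of
    observations in sample :math:`i` not exceeding :math:`Z^*_j`, counting
    ties with weight one half.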
1933
1934    """
1935    A2akN = 0.
1936    Z_ssorted_left = Z.searchsorted(Zstar, 'left')
1937    if N == Zstar.size:
1938        lj = 1.
1939    else:
1940        lj = Z.searchsorted(Zstar, 'right') - Z_ssorted_left
1941    Bj = Z_ssorted_left + lj / 2.
1942    for i in arange(0, k):
1943        s = np.sort(samples[i])
1944        s_ssorted_right = s.searchsorted(Zstar, side='right')
1945        Mij = s_ssorted_right.astype(float)
1946        fij = s_ssorted_right - s.searchsorted(Zstar, 'left')
1947        Mij -= fij / 2.
1948        inner = lj / float(N) * (N*Mij - Bj*n[i])**2 / (Bj*(N - Bj) - N*lj/4.)
1949        A2akN += inner.sum() / n[i]
1950    A2akN *= (N - 1.) / N
1951    return A2akN
1952
1953
1954def _anderson_ksamp_right(samples, Z, Zstar, k, n, N):
1955    """Compute A2akN equation 6 of Scholz & Stephens.
1956
1957    Parameters
1958    ----------
1959    samples : sequence of 1-D array_like
1960        Array of sample arrays.
1961    Z : array_like
1962        Sorted array of all observations.
1963    Zstar : array_like
1964        Sorted array of unique observations.
1965    k : int
1966        Number of samples.
1967    n : array_like
1968        Number of observations in each sample.
1969    N : int
1970        Total number of observations.
1971
1972    Returns
1973    -------
1974    A2KN : float
        The A2KN statistic of Scholz and Stephens 1987.
1976
1977    """
1978    A2kN = 0.
1979    lj = Z.searchsorted(Zstar[:-1], 'right') - Z.searchsorted(Zstar[:-1],
1980                                                              'left')
1981    Bj = lj.cumsum()
1982    for i in arange(0, k):
1983        s = np.sort(samples[i])
1984        Mij = s.searchsorted(Zstar[:-1], side='right')
1985        inner = lj / float(N) * (N * Mij - Bj * n[i])**2 / (Bj * (N - Bj))
1986        A2kN += inner.sum() / n[i]
1987    return A2kN
1988
1989
1990Anderson_ksampResult = namedtuple('Anderson_ksampResult',
1991                                  ('statistic', 'critical_values',
1992                                   'significance_level'))
1993
1994
1995def anderson_ksamp(samples, midrank=True):
1996    """The Anderson-Darling test for k-samples.
1997
1998    The k-sample Anderson-Darling test is a modification of the
1999    one-sample Anderson-Darling test. It tests the null hypothesis
    that k samples are drawn from the same population without having
2001    to specify the distribution function of that population. The
2002    critical values depend on the number of samples.
2003
2004    Parameters
2005    ----------
2006    samples : sequence of 1-D array_like
        Sequence of 1-D arrays of sample data.
2008    midrank : bool, optional
2009        Type of Anderson-Darling test which is computed. Default
2010        (True) is the midrank test applicable to continuous and
2011        discrete populations. If False, the right side empirical
2012        distribution is used.
2013
2014    Returns
2015    -------
2016    statistic : float
2017        Normalized k-sample Anderson-Darling test statistic.
2018    critical_values : array
2019        The critical values for significance levels 25%, 10%, 5%, 2.5%, 1%,
2020        0.5%, 0.1%.
2021    significance_level : float
2022        An approximate significance level at which the null hypothesis for the
2023        provided samples can be rejected. The value is floored / capped at
2024        0.1% / 25%.
2025
2026    Raises
2027    ------
2028    ValueError
2029        If less than 2 samples are provided, a sample is empty, or no
2030        distinct observations are in the samples.
2031
2032    See Also
2033    --------
2034    ks_2samp : 2 sample Kolmogorov-Smirnov test
2035    anderson : 1 sample Anderson-Darling test
2036
2037    Notes
2038    -----
2039    [1]_ defines three versions of the k-sample Anderson-Darling test:
2040    one for continuous distributions and two for discrete
2041    distributions, in which ties between samples may occur. The
2042    default of this routine is to compute the version based on the
2043    midrank empirical distribution function. This test is applicable
2044    to continuous and discrete data. If midrank is set to False, the
2045    right side empirical distribution is used for a test for discrete
2046    data. According to [1]_, the two discrete test statistics differ
2047    only slightly if a few collisions due to round-off errors occur in
2048    the test not adjusted for ties between samples.
2049
2050    The critical values corresponding to the significance levels from 0.01
2051    to 0.25 are taken from [1]_. p-values are floored / capped
2052    at 0.1% / 25%. Since the range of critical values might be extended in
2053    future releases, it is recommended not to test ``p == 0.25``, but rather
2054    ``p >= 0.25`` (analogously for the lower bound).
2055
2056    .. versionadded:: 0.14.0
2057
2058    References
2059    ----------
2060    .. [1] Scholz, F. W and Stephens, M. A. (1987), K-Sample
2061           Anderson-Darling Tests, Journal of the American Statistical
2062           Association, Vol. 82, pp. 918-924.
2063
2064    Examples
2065    --------
2066    >>> from scipy import stats
2067    >>> rng = np.random.default_rng()
2068
2069    The null hypothesis that the two random samples come from the same
2070    distribution can be rejected at the 5% level because the returned
2071    test value is greater than the critical value for 5% (1.961) but
    not at the 2.5% level. The interpolation gives an approximate
    significance level of 4.99%:
2074
2075    >>> stats.anderson_ksamp([rng.normal(size=50),
2076    ... rng.normal(loc=0.5, size=30)])
2077    (1.974403288713695,
2078      array([0.325, 1.226, 1.961, 2.718, 3.752, 4.592, 6.546]),
2079      0.04991293614572478)
2080
2081
2082    The null hypothesis cannot be rejected for three samples from an
2083    identical distribution. The reported p-value (25%) has been capped and
    may not be very accurate (since it corresponds to the value 0.449
    whereas the statistic is -0.291):
2086
2087    >>> stats.anderson_ksamp([rng.normal(size=50),
2088    ... rng.normal(size=30), rng.normal(size=20)])
2089    (-0.29103725200789504,
2090      array([ 0.44925884,  1.3052767 ,  1.9434184 ,  2.57696569,  3.41634856,
2091      4.07210043, 5.56419101]),
2092      0.25)
2093
2094    """
2095    k = len(samples)
2096    if (k < 2):
2097        raise ValueError("anderson_ksamp needs at least two samples")
2098
2099    samples = list(map(np.asarray, samples))
2100    Z = np.sort(np.hstack(samples))
2101    N = Z.size
2102    Zstar = np.unique(Z)
2103    if Zstar.size < 2:
2104        raise ValueError("anderson_ksamp needs more than one distinct "
2105                         "observation")
2106
2107    n = np.array([sample.size for sample in samples])
2108    if np.any(n == 0):
2109        raise ValueError("anderson_ksamp encountered sample without "
2110                         "observations")
2111
2112    if midrank:
2113        A2kN = _anderson_ksamp_midrank(samples, Z, Zstar, k, n, N)
2114    else:
2115        A2kN = _anderson_ksamp_right(samples, Z, Zstar, k, n, N)
2116
2117    H = (1. / n).sum()
2118    hs_cs = (1. / arange(N - 1, 1, -1)).cumsum()
2119    h = hs_cs[-1] + 1
2120    g = (hs_cs / arange(2, N)).sum()
2121
2122    a = (4*g - 6) * (k - 1) + (10 - 6*g)*H
2123    b = (2*g - 4)*k**2 + 8*h*k + (2*g - 14*h - 4)*H - 8*h + 4*g - 6
2124    c = (6*h + 2*g - 2)*k**2 + (4*h - 4*g + 6)*k + (2*h - 6)*H + 4*h
2125    d = (2*h + 6)*k**2 - 4*h*k
2126    sigmasq = (a*N**3 + b*N**2 + c*N + d) / ((N - 1.) * (N - 2.) * (N - 3.))
2127    m = k - 1
2128    A2 = (A2kN - m) / math.sqrt(sigmasq)
2129
2130    # The b_i values are the interpolation coefficients from Table 2
2131    # of Scholz and Stephens 1987
2132    b0 = np.array([0.675, 1.281, 1.645, 1.96, 2.326, 2.573, 3.085])
2133    b1 = np.array([-0.245, 0.25, 0.678, 1.149, 1.822, 2.364, 3.615])
2134    b2 = np.array([-0.105, -0.305, -0.362, -0.391, -0.396, -0.345, -0.154])
2135    critical = b0 + b1 / math.sqrt(m) + b2 / m
2136
2137    sig = np.array([0.25, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001])
2138    if A2 < critical.min():
2139        p = sig.max()
2140        warnings.warn("p-value capped: true value larger than {}".format(p),
2141                      stacklevel=2)
2142    elif A2 > critical.max():
2143        p = sig.min()
2144        warnings.warn("p-value floored: true value smaller than {}".format(p),
2145                      stacklevel=2)
2146    else:
        # quadratic interpolation of the log of the significance level
2148        pf = np.polyfit(critical, log(sig), 2)
2149        p = math.exp(np.polyval(pf, A2))
2150
2151    return Anderson_ksampResult(A2, critical, p)
2152
2153
2154AnsariResult = namedtuple('AnsariResult', ('statistic', 'pvalue'))
2155
2156
2157class _ABW:
2158    """Distribution of Ansari-Bradley W-statistic under the null hypothesis."""
2159    # TODO: calculate exact distribution considering ties
2160    # We could avoid summing over more than half the frequencies,
    # but initially it doesn't seem worth the extra complexity
2162
2163    def __init__(self):
2164        """Minimal initializer."""
2165        self.m = None
2166        self.n = None
2167        self.astart = None
2168        self.total = None
2169        self.freqs = None
2170
2171    def _recalc(self, n, m):
2172        """When necessary, recalculate exact distribution."""
2173        if n != self.n or m != self.m:
2174            self.n, self.m = n, m
2175            # distribution is NOT symmetric when m + n is odd
2176            # n is len(x), m is len(y), and ratio of scales is defined x/y
2177            astart, a1, _ = statlib.gscale(n, m)
2178            self.astart = astart  # minimum value of statistic
2179            # Exact distribution of test statistic under null hypothesis
2180            # expressed as frequencies/counts/integers to maintain precision.
2181            # Stored as floats to avoid overflow of sums.
2182            self.freqs = a1.astype(np.float64)
2183            self.total = self.freqs.sum()  # could calculate from m and n
2184            # probability mass is self.freqs / self.total;
2185
2186    def pmf(self, k, n, m):
2187        """Probability mass function."""
2188        self._recalc(n, m)
        # The convention here is that the PMF at k = 12.5 is the same as at
        # k = 12, so use `floor` in case of ties.
2191        ind = np.floor(k - self.astart).astype(int)
2192        return self.freqs[ind] / self.total
2193
2194    def cdf(self, k, n, m):
2195        """Cumulative distribution function."""
2196        self._recalc(n, m)
        # Null distribution derived without considering ties is
        # approximate. Round the index up so the CDF is not understated,
        # keeping the p-value conservative (avoids inflated Type I error).
2199        ind = np.ceil(k - self.astart).astype(int)
2200        return self.freqs[:ind+1].sum() / self.total
2201
2202    def sf(self, k, n, m):
2203        """Survival function."""
2204        self._recalc(n, m)
2205        # Null distribution derived without considering ties is
2206        # approximate. Round down to avoid Type I error.
2207        ind = np.floor(k - self.astart).astype(int)
2208        return self.freqs[ind:].sum() / self.total
2209
2210
2211# Maintain state for faster repeat calls to ansari w/ method='exact'
2212_abw_state = _ABW()
2213
2214
2215def ansari(x, y, alternative='two-sided'):
2216    """Perform the Ansari-Bradley test for equal scale parameters.
2217
2218    The Ansari-Bradley test ([1]_, [2]_) is a non-parametric test
2219    for the equality of the scale parameter of the distributions
2220    from which two samples were drawn. The null hypothesis states that
2221    the ratio of the scale of the distribution underlying `x` to the scale
2222    of the distribution underlying `y` is 1.
2223
2224    Parameters
2225    ----------
2226    x, y : array_like
2227        Arrays of sample data.
2228    alternative : {'two-sided', 'less', 'greater'}, optional
2229        Defines the alternative hypothesis. Default is 'two-sided'.
2230        The following options are available:
2231
2232        * 'two-sided': the ratio of scales is not equal to 1.
2233        * 'less': the ratio of scales is less than 1.
2234        * 'greater': the ratio of scales is greater than 1.
2235
2236        .. versionadded:: 1.7.0
2237
2238    Returns
2239    -------
2240    statistic : float
2241        The Ansari-Bradley test statistic.
2242    pvalue : float
2243        The p-value of the hypothesis test.
2244
2245    See Also
2246    --------
2247    fligner : A non-parametric test for the equality of k variances
2248    mood : A non-parametric test for the equality of two scale parameters
2249
2250    Notes
2251    -----
2252    The p-value given is exact when the sample sizes are both less than
2253    55 and there are no ties, otherwise a normal approximation for the
2254    p-value is used.
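
    The statistic ``AB`` computed here is the sum, over the observations in
    `x`, of the symmetrized ranks :math:`\min(R_i,\; N + 1 - R_i)`, where
    :math:`R_i` is the rank in the combined sample of size
    :math:`N = n + m`; small values of ``AB`` indicate a larger dispersion
    for the `x` sample.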
2255
2256    References
2257    ----------
2258    .. [1] Ansari, A. R. and Bradley, R. A. (1960) Rank-sum tests for
2259           dispersions, Annals of Mathematical Statistics, 31, 1174-1189.
2260    .. [2] Sprent, Peter and N.C. Smeeton.  Applied nonparametric
2261           statistical methods.  3rd ed. Chapman and Hall/CRC. 2001.
2262           Section 5.8.2.
2263    .. [3] Nathaniel E. Helwig "Nonparametric Dispersion and Equality
2264           Tests" at http://users.stat.umn.edu/~helwig/notes/npde-Notes.pdf
2265
2266    Examples
2267    --------
2268    >>> from scipy.stats import ansari
2269    >>> rng = np.random.default_rng()
2270
2271    For these examples, we'll create three random data sets.  The first
2272    two, with sizes 35 and 25, are drawn from a normal distribution with
2273    mean 0 and standard deviation 2.  The third data set has size 25 and
2274    is drawn from a normal distribution with standard deviation 1.25.
2275
2276    >>> x1 = rng.normal(loc=0, scale=2, size=35)
2277    >>> x2 = rng.normal(loc=0, scale=2, size=25)
2278    >>> x3 = rng.normal(loc=0, scale=1.25, size=25)
2279
2280    First we apply `ansari` to `x1` and `x2`.  These samples are drawn
2281    from the same distribution, so we expect the Ansari-Bradley test
2282    should not lead us to conclude that the scales of the distributions
2283    are different.
2284
2285    >>> ansari(x1, x2)
2286    AnsariResult(statistic=541.0, pvalue=0.9762532927399098)
2287
2288    With a p-value close to 1, we cannot conclude that there is a
2289    significant difference in the scales (as expected).
2290
2291    Now apply the test to `x1` and `x3`:
2292
2293    >>> ansari(x1, x3)
2294    AnsariResult(statistic=425.0, pvalue=0.0003087020407974518)
2295
2296    The probability of observing such an extreme value of the statistic
2297    under the null hypothesis of equal scales is only 0.03087%. We take this
2298    as evidence against the null hypothesis in favor of the alternative:
2299    the scales of the distributions from which the samples were drawn
2300    are not equal.
2301
2302    We can use the `alternative` parameter to perform a one-tailed test.
2303    In the above example, the scale of `x1` is greater than `x3` and so
2304    the ratio of scales of `x1` and `x3` is greater than 1. This means
2305    that the p-value when ``alternative='greater'`` should be near 0 and
2306    hence we should be able to reject the null hypothesis:
2307
2308    >>> ansari(x1, x3, alternative='greater')
2309    AnsariResult(statistic=425.0, pvalue=0.0001543510203987259)
2310
2311    As we can see, the p-value is indeed quite low. Use of
2312    ``alternative='less'`` should thus yield a large p-value:
2313
2314    >>> ansari(x1, x3, alternative='less')
2315    AnsariResult(statistic=425.0, pvalue=0.9998643258449039)
2316
2317    """
2318    if alternative not in {'two-sided', 'greater', 'less'}:
2319        raise ValueError("'alternative' must be 'two-sided',"
2320                         " 'greater', or 'less'.")
2321    x, y = asarray(x), asarray(y)
2322    n = len(x)
2323    m = len(y)
2324    if m < 1:
2325        raise ValueError("Not enough other observations.")
2326    if n < 1:
2327        raise ValueError("Not enough test observations.")
2328
2329    N = m + n
2330    xy = r_[x, y]  # combine
2331    rank = stats.rankdata(xy)
2332    symrank = amin(array((rank, N - rank + 1)), 0)
2333    AB = np.sum(symrank[:n], axis=0)
2334    uxy = unique(xy)
2335    repeats = (len(uxy) != len(xy))
2336    exact = ((m < 55) and (n < 55) and not repeats)
2337    if repeats and (m < 55 or n < 55):
2338        warnings.warn("Ties preclude use of exact statistic.")
2339    if exact:
2340        if alternative == 'two-sided':
2341            pval = 2.0 * np.minimum(_abw_state.cdf(AB, n, m),
2342                                    _abw_state.sf(AB, n, m))
2343        elif alternative == 'greater':
2344            # AB statistic is _smaller_ when ratio of scales is larger,
2345            # so this is the opposite of the usual calculation
2346            pval = _abw_state.cdf(AB, n, m)
2347        else:
2348            pval = _abw_state.sf(AB, n, m)
2349        return AnsariResult(AB, min(1.0, pval))
2350
2351    # otherwise compute normal approximation
2352    if N % 2:  # N odd
2353        mnAB = n * (N+1.0)**2 / 4.0 / N
2354        varAB = n * m * (N+1.0) * (3+N**2) / (48.0 * N**2)
2355    else:
2356        mnAB = n * (N+2.0) / 4.0
2357        varAB = m * n * (N+2) * (N-2.0) / 48 / (N-1.0)
2358    if repeats:   # adjust variance estimates
2359        # compute np.sum(tj * rj**2,axis=0)
2360        fac = np.sum(symrank**2, axis=0)
2361        if N % 2:  # N odd
2362            varAB = m * n * (16*N*fac - (N+1)**4) / (16.0 * N**2 * (N-1))
2363        else:  # N even
2364            varAB = m * n * (16*fac - N*(N+2)**2) / (16.0 * N * (N-1))
2365
2366    # Small values of AB indicate larger dispersion for the x sample.
2367    # Large values of AB indicate larger dispersion for the y sample.
2368    # This is opposite to the way we define the ratio of scales. see [1]_.
2369    z = (mnAB - AB) / sqrt(varAB)
2370    z, pval = _normtest_finish(z, alternative)
2371    return AnsariResult(AB, pval)
2372
2373
2374BartlettResult = namedtuple('BartlettResult', ('statistic', 'pvalue'))
2375
2376
2377def bartlett(*args):
2378    """Perform Bartlett's test for equal variances.
2379
2380    Bartlett's test tests the null hypothesis that all input samples
2381    are from populations with equal variances.  For samples
2382    from significantly non-normal populations, Levene's test
2383    `levene` is more robust.
2384
2385    Parameters
2386    ----------
2387    sample1, sample2,... : array_like
2388        arrays of sample data.  Only 1d arrays are accepted, they may have
2389        different lengths.
2390
2391    Returns
2392    -------
2393    statistic : float
2394        The test statistic.
2395    pvalue : float
2396        The p-value of the test.
2397
2398    See Also
2399    --------
2400    fligner : A non-parametric test for the equality of k variances
2401    levene : A robust parametric test for equality of k variances
2402
2403    Notes
2404    -----
    Conover et al. (1981) examine many of the existing parametric and
    nonparametric tests by extensive simulations and they conclude that the
    tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to
    be superior in terms of robustness to departures from normality and power
    ([3]_).
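
    The test statistic computed here is

    .. math::

        T = \frac{(N - k) \ln(s^2_p) - \sum_{i=1}^k (n_i - 1) \ln(s^2_i)}
                 {1 + \frac{1}{3(k - 1)} \left(\sum_{i=1}^k \frac{1}{n_i - 1}
                  - \frac{1}{N - k}\right)},

    where :math:`s^2_i` is the variance of the :math:`i`-th sample of size
    :math:`n_i` and :math:`s^2_p` is the pooled variance. Under the null
    hypothesis, ``T`` approximately follows a chi-squared distribution with
    ``k - 1`` degrees of freedom.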
2410
2411    References
2412    ----------
2413    .. [1]  https://www.itl.nist.gov/div898/handbook/eda/section3/eda357.htm
2414
2415    .. [2]  Snedecor, George W. and Cochran, William G. (1989), Statistical
2416              Methods, Eighth Edition, Iowa State University Press.
2417
2418    .. [3] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
2419           Hypothesis Testing based on Quadratic Inference Function. Technical
2420           Report #99-03, Center for Likelihood Studies, Pennsylvania State
2421           University.
2422
2423    .. [4] Bartlett, M. S. (1937). Properties of Sufficiency and Statistical
2424           Tests. Proceedings of the Royal Society of London. Series A,
2425           Mathematical and Physical Sciences, Vol. 160, No.901, pp. 268-282.
2426
2427    Examples
2428    --------
2429    Test whether or not the lists `a`, `b` and `c` come from populations
2430    with equal variances.
2431
2432    >>> from scipy.stats import bartlett
2433    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2434    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2435    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2436    >>> stat, p = bartlett(a, b, c)
2437    >>> p
2438    1.1254782518834628e-05
2439
2440    The very small p-value suggests that the populations do not have equal
2441    variances.
2442
2443    This is not surprising, given that the sample variance of `b` is much
2444    larger than that of `a` and `c`:
2445
2446    >>> [np.var(x, ddof=1) for x in [a, b, c]]
2447    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2448
2449    """
2450    # Handle empty input and input that is not 1d
2451    for a in args:
2452        if np.asanyarray(a).size == 0:
2453            return BartlettResult(np.nan, np.nan)
2454        if np.asanyarray(a).ndim > 1:
2455            raise ValueError('Samples must be one-dimensional.')
2456
2457    k = len(args)
2458    if k < 2:
2459        raise ValueError("Must enter at least two input sample vectors.")
2460    Ni = np.empty(k)
2461    ssq = np.empty(k, 'd')
2462    for j in range(k):
2463        Ni[j] = len(args[j])
2464        ssq[j] = np.var(args[j], ddof=1)
2465    Ntot = np.sum(Ni, axis=0)
2466    spsq = np.sum((Ni - 1)*ssq, axis=0) / (1.0*(Ntot - k))
2467    numer = (Ntot*1.0 - k) * log(spsq) - np.sum((Ni - 1.0)*log(ssq), axis=0)
2468    denom = 1.0 + 1.0/(3*(k - 1)) * ((np.sum(1.0/(Ni - 1.0), axis=0)) -
2469                                     1.0/(Ntot - k))
2470    T = numer / denom
2471    pval = distributions.chi2.sf(T, k - 1)  # 1 - cdf
2472
2473    return BartlettResult(T, pval)
2474
2475
2476LeveneResult = namedtuple('LeveneResult', ('statistic', 'pvalue'))
2477
2478
2479def levene(*args, center='median', proportiontocut=0.05):
2480    """Perform Levene test for equal variances.
2481
2482    The Levene test tests the null hypothesis that all input samples
2483    are from populations with equal variances.  Levene's test is an
2484    alternative to Bartlett's test `bartlett` in the case where
2485    there are significant deviations from normality.
2486
2487    Parameters
2488    ----------
2489    sample1, sample2, ... : array_like
2490        The sample data, possibly with different lengths. Only one-dimensional
2491        samples are accepted.
2492    center : {'mean', 'median', 'trimmed'}, optional
2493        Which function of the data to use in the test.  The default
2494        is 'median'.
2495    proportiontocut : float, optional
2496        When `center` is 'trimmed', this gives the proportion of data points
2497        to cut from each end. (See `scipy.stats.trim_mean`.)
2498        Default is 0.05.
2499
2500    Returns
2501    -------
2502    statistic : float
2503        The test statistic.
2504    pvalue : float
2505        The p-value for the test.
2506
2507    Notes
2508    -----
2509    Three variations of Levene's test are possible.  The possibilities
2510    and their recommended usages are:
2511
      * 'median' : Recommended for skewed (non-normal) distributions.
2513      * 'mean' : Recommended for symmetric, moderate-tailed distributions.
2514      * 'trimmed' : Recommended for heavy-tailed distributions.
2515
2516    The test version using the mean was proposed in the original article
2517    of Levene ([2]_) while the median and trimmed mean have been studied by
2518    Brown and Forsythe ([3]_), sometimes also referred to as Brown-Forsythe
2519    test.
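
    The test statistic is

    .. math::

        W = \frac{(N - k)}{(k - 1)} \cdot
            \frac{\sum_{i=1}^k n_i (\bar{Z}_{i.} - \bar{Z}_{..})^2}
                 {\sum_{i=1}^k \sum_{j=1}^{n_i} (Z_{ij} - \bar{Z}_{i.})^2},

    where :math:`Z_{ij} = |x_{ij} - \bar{x}_{i.}|` with :math:`\bar{x}_{i.}`
    the center (mean, median or trimmed mean) of sample :math:`i`. Under the
    null hypothesis, ``W`` approximately follows an F distribution with
    ``k - 1`` and ``N - k`` degrees of freedom.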
2520
2521    References
2522    ----------
2523    .. [1] https://www.itl.nist.gov/div898/handbook/eda/section3/eda35a.htm
2524    .. [2] Levene, H. (1960). In Contributions to Probability and Statistics:
2525           Essays in Honor of Harold Hotelling, I. Olkin et al. eds.,
2526           Stanford University Press, pp. 278-292.
2527    .. [3] Brown, M. B. and Forsythe, A. B. (1974), Journal of the American
2528           Statistical Association, 69, 364-367
2529
2530    Examples
2531    --------
2532    Test whether or not the lists `a`, `b` and `c` come from populations
2533    with equal variances.
2534
2535    >>> from scipy.stats import levene
2536    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2537    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2538    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2539    >>> stat, p = levene(a, b, c)
2540    >>> p
2541    0.002431505967249681
2542
2543    The small p-value suggests that the populations do not have equal
2544    variances.
2545
2546    This is not surprising, given that the sample variance of `b` is much
2547    larger than that of `a` and `c`:
2548
2549    >>> [np.var(x, ddof=1) for x in [a, b, c]]
2550    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2551
2552    """
2553    if center not in ['mean', 'median', 'trimmed']:
2554        raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2555
2556    k = len(args)
2557    if k < 2:
2558        raise ValueError("Must enter at least two input sample vectors.")
2559    # check for 1d input
2560    for j in range(k):
2561        if np.asanyarray(args[j]).ndim > 1:
2562            raise ValueError('Samples must be one-dimensional.')
2563
2564    Ni = np.empty(k)
2565    Yci = np.empty(k, 'd')
2566
2567    if center == 'median':
2568        func = lambda x: np.median(x, axis=0)
2569    elif center == 'mean':
2570        func = lambda x: np.mean(x, axis=0)
2571    else:  # center == 'trimmed'
2572        args = tuple(stats.trimboth(np.sort(arg), proportiontocut)
2573                     for arg in args)
2574        func = lambda x: np.mean(x, axis=0)
2575
2576    for j in range(k):
2577        Ni[j] = len(args[j])
2578        Yci[j] = func(args[j])
2579    Ntot = np.sum(Ni, axis=0)
2580
2581    # compute Zij's
2582    Zij = [None] * k
2583    for i in range(k):
2584        Zij[i] = abs(asarray(args[i]) - Yci[i])
2585
2586    # compute Zbari
2587    Zbari = np.empty(k, 'd')
2588    Zbar = 0.0
2589    for i in range(k):
2590        Zbari[i] = np.mean(Zij[i], axis=0)
2591        Zbar += Zbari[i] * Ni[i]
2592
2593    Zbar /= Ntot
2594    numer = (Ntot - k) * np.sum(Ni * (Zbari - Zbar)**2, axis=0)
2595
2596    # compute denom_variance
2597    dvar = 0.0
2598    for i in range(k):
2599        dvar += np.sum((Zij[i] - Zbari[i])**2, axis=0)
2600
2601    denom = (k - 1.0) * dvar
2602
2603    W = numer / denom
2604    pval = distributions.f.sf(W, k-1, Ntot-k)  # 1 - cdf
2605    return LeveneResult(W, pval)
2606
2607
2608def binom_test(x, n=None, p=0.5, alternative='two-sided'):
2609    """Perform a test that the probability of success is p.
2610
2611    Note: `binom_test` is deprecated; it is recommended that `binomtest`
2612    be used instead.
2613
2614    This is an exact, two-sided test of the null hypothesis
2615    that the probability of success in a Bernoulli experiment
2616    is `p`.
2617
2618    Parameters
2619    ----------
2620    x : int or array_like
2621        The number of successes, or if x has length 2, it is the
2622        number of successes and the number of failures.
2623    n : int
2624        The number of trials.  This is ignored if x gives both the
2625        number of successes and failures.
2626    p : float, optional
2627        The hypothesized probability of success.  ``0 <= p <= 1``. The
2628        default value is ``p = 0.5``.
2629    alternative : {'two-sided', 'greater', 'less'}, optional
2630        Indicates the alternative hypothesis. The default value is
2631        'two-sided'.
2632
2633    Returns
2634    -------
2635    p-value : float
2636        The p-value of the hypothesis test.
2637
2638    References
2639    ----------
2640    .. [1] https://en.wikipedia.org/wiki/Binomial_test
2641
2642    Examples
2643    --------
2644    >>> from scipy import stats
2645
2646    A car manufacturer claims that no more than 10% of their cars are unsafe.
2647    15 cars are inspected for safety, 3 were found to be unsafe. Test the
2648    manufacturer's claim:
2649
2650    >>> stats.binom_test(3, n=15, p=0.1, alternative='greater')
2651    0.18406106910639114
2652
    The null hypothesis cannot be rejected at the 5% level of significance
    because the returned p-value is greater than the significance level of 5%.
2655
2656    """
2657    x = atleast_1d(x).astype(np.int_)
2658    if len(x) == 2:
2659        n = x[1] + x[0]
2660        x = x[0]
2661    elif len(x) == 1:
2662        x = x[0]
2663        if n is None or n < x:
2664            raise ValueError("n must be >= x")
2665        n = np.int_(n)
2666    else:
2667        raise ValueError("Incorrect length for x.")
2668
2669    if (p > 1.0) or (p < 0.0):
2670        raise ValueError("p must be in range [0,1]")
2671
2672    if alternative not in ('two-sided', 'less', 'greater'):
2673        raise ValueError("alternative not recognized\n"
2674                         "should be 'two-sided', 'less' or 'greater'")
2675
2676    if alternative == 'less':
2677        pval = distributions.binom.cdf(x, n, p)
2678        return pval
2679
2680    if alternative == 'greater':
2681        pval = distributions.binom.sf(x-1, n, p)
2682        return pval
2683
2684    # if alternative was neither 'less' nor 'greater', then it's 'two-sided'
2685    d = distributions.binom.pmf(x, n, p)
2686    rerr = 1 + 1e-7
2687    if x == p * n:
2688        # special case as shortcut, would also be handled by `else` below
2689        pval = 1.
2690    elif x < p * n:
2691        i = np.arange(np.ceil(p * n), n+1)
2692        y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2693        pval = (distributions.binom.cdf(x, n, p) +
2694                distributions.binom.sf(n - y, n, p))
2695    else:
2696        i = np.arange(np.floor(p*n) + 1)
2697        y = np.sum(distributions.binom.pmf(i, n, p) <= d*rerr, axis=0)
2698        pval = (distributions.binom.cdf(y-1, n, p) +
2699                distributions.binom.sf(x-1, n, p))
2700
2701    return min(1.0, pval)
2702
2703
2704def _apply_func(x, g, func):
2705    # g is list of indices into x
2706    #  separating x into different groups
2707    #  func should be applied over the groups
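    # e.g. with x of length 5 and g = [2]: groups are x[0:2] and x[2:5],
    #  so the output is [func(x[0:2]), func(x[2:5])]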
2708    g = unique(r_[0, g, len(x)])
2709    output = [func(x[g[k]:g[k+1]]) for k in range(len(g) - 1)]
2710
2711    return asarray(output)
2712
2713
2714FlignerResult = namedtuple('FlignerResult', ('statistic', 'pvalue'))
2715
2716
2717def fligner(*args, center='median', proportiontocut=0.05):
2718    """Perform Fligner-Killeen test for equality of variance.
2719
2720    Fligner's test tests the null hypothesis that all input samples
2721    are from populations with equal variances.  Fligner-Killeen's test is
2722    distribution free when populations are identical [2]_.
2723
2724    Parameters
2725    ----------
2726    sample1, sample2, ... : array_like
2727        Arrays of sample data.  Need not be the same length.
2728    center : {'mean', 'median', 'trimmed'}, optional
2729        Keyword argument controlling which function of the data is used in
2730        computing the test statistic.  The default is 'median'.
2731    proportiontocut : float, optional
2732        When `center` is 'trimmed', this gives the proportion of data points
2733        to cut from each end. (See `scipy.stats.trim_mean`.)
2734        Default is 0.05.
2735
2736    Returns
2737    -------
2738    statistic : float
2739        The test statistic.
2740    pvalue : float
2741        The p-value for the hypothesis test.
2742
2743    See Also
2744    --------
2745    bartlett : A parametric test for equality of k variances in normal samples
2746    levene : A robust parametric test for equality of k variances
2747
2748    Notes
2749    -----
2750    As with Levene's test there are three variants of Fligner's test that
2751    differ by the measure of central tendency used in the test.  See `levene`
2752    for more information.
2753
    Conover et al. (1981) examine many of the existing parametric and
    nonparametric tests by extensive simulations and they conclude that the
    tests proposed by Fligner and Killeen (1976) and Levene (1960) appear to
    be superior in terms of robustness to departures from normality and
    power [3]_.
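
    The statistic computed here is based on the ranks :math:`r_{ij}` of
    :math:`|x_{ij} - \tilde{x}_i|` (absolute deviations from the sample
    centers) in the combined sample of size :math:`N`, converted to normal
    scores :math:`a_{ij} = \Phi^{-1}\left(\frac{1}{2} +
    \frac{r_{ij}}{2(N + 1)}\right)`. The test statistic

    .. math:: X^2 = \frac{\sum_{i=1}^k n_i (\bar{a}_i - \bar{a})^2}{V^2}

    is compared against a chi-squared distribution with ``k - 1`` degrees
    of freedom, where :math:`\bar{a}_i` is the mean score in sample
    :math:`i`, and :math:`\bar{a}` and :math:`V^2` are the mean and variance
    of all scores.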
2758
2759    References
2760    ----------
    .. [1] Park, C. and Lindsay, B. G. (1999). Robust Scale Estimation and
           Hypothesis Testing based on Quadratic Inference Function. Technical
           Report #99-03, Center for Likelihood Studies, Pennsylvania State
           University.
           https://cecas.clemson.edu/~cspark/cv/paper/qif/draftqif2.pdf

    .. [2] Fligner, M.A. and Killeen, T.J. (1976). Distribution-free two-sample
           tests for scale. Journal of the American Statistical Association,
           71(353), 210-213.

    .. [3] Conover, W. J., Johnson, M. E. and Johnson M. M. (1981). A
           comparative study of tests for homogeneity of variances, with
           applications to the outer continental shelf bidding data.
           Technometrics, 23(4), 351-361.
2780
2781    Examples
2782    --------
2783    Test whether or not the lists `a`, `b` and `c` come from populations
2784    with equal variances.
2785
2786    >>> from scipy.stats import fligner
2787    >>> a = [8.88, 9.12, 9.04, 8.98, 9.00, 9.08, 9.01, 8.85, 9.06, 8.99]
2788    >>> b = [8.88, 8.95, 9.29, 9.44, 9.15, 9.58, 8.36, 9.18, 8.67, 9.05]
2789    >>> c = [8.95, 9.12, 8.95, 8.85, 9.03, 8.84, 9.07, 8.98, 8.86, 8.98]
2790    >>> stat, p = fligner(a, b, c)
2791    >>> p
2792    0.00450826080004775
2793
2794    The small p-value suggests that the populations do not have equal
2795    variances.
2796
2797    This is not surprising, given that the sample variance of `b` is much
2798    larger than that of `a` and `c`:
2799
2800    >>> [np.var(x, ddof=1) for x in [a, b, c]]
2801    [0.007054444444444413, 0.13073888888888888, 0.008890000000000002]
2802
2803    """
2804    if center not in ['mean', 'median', 'trimmed']:
2805        raise ValueError("center must be 'mean', 'median' or 'trimmed'.")
2806
2807    # Handle empty input
2808    for a in args:
2809        if np.asanyarray(a).size == 0:
2810            return FlignerResult(np.nan, np.nan)
2811
2812    k = len(args)
2813    if k < 2:
2814        raise ValueError("Must enter at least two input sample vectors.")
2815
2816    if center == 'median':
2817        func = lambda x: np.median(x, axis=0)
2818    elif center == 'mean':
2819        func = lambda x: np.mean(x, axis=0)
2820    else:  # center == 'trimmed'
2821        args = tuple(stats.trimboth(arg, proportiontocut) for arg in args)
2822        func = lambda x: np.mean(x, axis=0)
2823
2824    Ni = asarray([len(args[j]) for j in range(k)])
2825    Yci = asarray([func(args[j]) for j in range(k)])
2826    Ntot = np.sum(Ni, axis=0)
2827    # compute Zij's
2828    Zij = [abs(asarray(args[i]) - Yci[i]) for i in range(k)]
2829    allZij = []
2830    g = [0]
2831    for i in range(k):
2832        allZij.extend(list(Zij[i]))
2833        g.append(len(allZij))
2834
2835    ranks = stats.rankdata(allZij)
2836    a = distributions.norm.ppf(ranks / (2*(Ntot + 1.0)) + 0.5)
2837
2838    # compute Aibar
2839    Aibar = _apply_func(a, g, np.sum) / Ni
2840    anbar = np.mean(a, axis=0)
2841    varsq = np.var(a, axis=0, ddof=1)
2842    Xsq = np.sum(Ni * (asarray(Aibar) - anbar)**2.0, axis=0) / varsq
2843    pval = distributions.chi2.sf(Xsq, k - 1)  # 1 - cdf
2844    return FlignerResult(Xsq, pval)
2845
2846
2847def mood(x, y, axis=0, alternative="two-sided"):
2848    """Perform Mood's test for equal scale parameters.
2849
2850    Mood's two-sample test for scale parameters is a non-parametric
2851    test for the null hypothesis that two samples are drawn from the
2852    same distribution with the same scale parameter.
2853
2854    Parameters
2855    ----------
2856    x, y : array_like
2857        Arrays of sample data.
2858    axis : int, optional
2859        The axis along which the samples are tested.  `x` and `y` can be of
2860        different length along `axis`.
2861        If `axis` is None, `x` and `y` are flattened and the test is done on
2862        all values in the flattened arrays.
2863    alternative : {'two-sided', 'less', 'greater'}, optional
2864        Defines the alternative hypothesis. Default is 'two-sided'.
2865        The following options are available:
2866
2867        * 'two-sided': the scales of the distributions underlying `x` and `y`
2868          are different.
2869        * 'less': the scale of the distribution underlying `x` is less than
2870          the scale of the distribution underlying `y`.
2871        * 'greater': the scale of the distribution underlying `x` is greater
2872          than the scale of the distribution underlying `y`.
2873
2874        .. versionadded:: 1.7.0
2875
2876    Returns
2877    -------
2878    z : scalar or ndarray
2879        The z-score for the hypothesis test.  For 1-D inputs a scalar is
2880        returned.
    p-value : scalar or ndarray
2882        The p-value for the hypothesis test.
2883
2884    See Also
2885    --------
2886    fligner : A non-parametric test for the equality of k variances
2887    ansari : A non-parametric test for the equality of 2 variances
2888    bartlett : A parametric test for equality of k variances in normal samples
2889    levene : A parametric test for equality of k variances
2890
2891    Notes
2892    -----
2893    The data are assumed to be drawn from probability distributions ``f(x)``
2894    and ``f(x/s) / s`` respectively, for some probability density function f.
2895    The null hypothesis is that ``s == 1``.
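
    The test statistic computed here is

    .. math::

        M = \sum_{i=1}^n \left(R_i - \frac{N + 1}{2}\right)^2,

    where the :math:`R_i` are the ranks of the `x` observations in the
    combined sample of size :math:`N = n + m`. Under the null hypothesis,
    :math:`M` is approximately normal with mean :math:`n(N^2 - 1)/12` and
    variance :math:`nm(N + 1)(N + 2)(N - 2)/180`, which yields the z-score
    returned by this function.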
2896
2897    For multi-dimensional arrays, if the inputs are of shapes
2898    ``(n0, n1, n2, n3)``  and ``(n0, m1, n2, n3)``, then if ``axis=1``, the
2899    resulting z and p values will have shape ``(n0, n2, n3)``.  Note that
2900    ``n1`` and ``m1`` don't have to be equal, but the other dimensions do.
2901
2902    Examples
2903    --------
2904    >>> from scipy import stats
2905    >>> rng = np.random.default_rng()
2906    >>> x2 = rng.standard_normal((2, 45, 6, 7))
2907    >>> x1 = rng.standard_normal((2, 30, 6, 7))
2908    >>> z, p = stats.mood(x1, x2, axis=1)
2909    >>> p.shape
2910    (2, 6, 7)
2911
2912    Find the number of points where the difference in scale is not significant:
2913
2914    >>> (p > 0.1).sum()
2915    78
2916
2917    Perform the test with different scales:
2918
2919    >>> x1 = rng.standard_normal((2, 30))
2920    >>> x2 = rng.standard_normal((2, 35)) * 10.0
2921    >>> stats.mood(x1, x2, axis=1)
2922    (array([-5.76174136, -6.12650783]), array([8.32505043e-09, 8.98287869e-10]))
2923
2924    """
2925    x = np.asarray(x, dtype=float)
2926    y = np.asarray(y, dtype=float)
2927
2928    if axis is None:
2929        x = x.flatten()
2930        y = y.flatten()
2931        axis = 0
2932
2933    if axis < 0:
2934        axis = x.ndim + axis
2935
2936    # Determine shape of the result arrays
2937    res_shape = tuple([x.shape[ax] for ax in range(len(x.shape)) if ax != axis])
2938    if not (res_shape == tuple([y.shape[ax] for ax in range(len(y.shape)) if
2939                                ax != axis])):
2940        raise ValueError("Dimensions of x and y on all axes except `axis` "
2941                         "should match")
2942
2943    n = x.shape[axis]
2944    m = y.shape[axis]
2945    N = m + n
2946    if N < 3:
2947        raise ValueError("Not enough observations.")
2948
2949    xy = np.concatenate((x, y), axis=axis)
2950    if axis != 0:
2951        xy = np.rollaxis(xy, axis)
2952
2953    xy = xy.reshape(xy.shape[0], -1)
2954
    # Generalized to the n-dimensional case by adding the axis argument, and
    # using for loops, since rankdata is not vectorized.  To improve
    # performance, consider vectorizing the rankdata function.
2958    all_ranks = np.empty_like(xy)
2959    for j in range(xy.shape[1]):
2960        all_ranks[:, j] = stats.rankdata(xy[:, j])
2961
2962    Ri = all_ranks[:n]
2963    M = np.sum((Ri - (N + 1.0) / 2)**2, axis=0)
2964    # Approx stat.
2965    mnM = n * (N * N - 1.0) / 12
2966    varM = m * n * (N + 1.0) * (N + 2) * (N - 2) / 180
2967    z = (M - mnM) / sqrt(varM)
2968    z, pval = _normtest_finish(z, alternative)
2969
2970    if res_shape == ():
2971        # Return scalars, not 0-D arrays
2972        z = z[0]
2973        pval = pval[0]
2974    else:
2975        z.shape = res_shape
2976        pval.shape = res_shape
2977
2978    return z, pval
2979
2980
2981WilcoxonResult = namedtuple('WilcoxonResult', ('statistic', 'pvalue'))
2982
2983
2984def wilcoxon(x, y=None, zero_method="wilcox", correction=False,
2985             alternative="two-sided", mode='auto'):
2986    """Calculate the Wilcoxon signed-rank test.
2987
2988    The Wilcoxon signed-rank test tests the null hypothesis that two
2989    related paired samples come from the same distribution. In particular,
2990    it tests whether the distribution of the differences x - y is symmetric
2991    about zero. It is a non-parametric version of the paired T-test.
2992
2993    Parameters
2994    ----------
2995    x : array_like
2996        Either the first set of measurements (in which case ``y`` is the second
2997        set of measurements), or the differences between two sets of
2998        measurements (in which case ``y`` is not to be specified.)  Must be
2999        one-dimensional.
3000    y : array_like, optional
3001        Either the second set of measurements (if ``x`` is the first set of
3002        measurements), or not specified (if ``x`` is the differences between
3003        two sets of measurements.)  Must be one-dimensional.
3004    zero_method : {"pratt", "wilcox", "zsplit"}, optional
3005        The following options are available (default is "wilcox"):
3006
3007          * "pratt": Includes zero-differences in the ranking process,
3008            but drops the ranks of the zeros, see [4]_, (more conservative).
3009          * "wilcox": Discards all zero-differences, the default.
3010          * "zsplit": Includes zero-differences in the ranking process and
            splits the zero rank between positive and negative ones.
3012    correction : bool, optional
3013        If True, apply continuity correction by adjusting the Wilcoxon rank
3014        statistic by 0.5 towards the mean value when computing the
3015        z-statistic if a normal approximation is used.  Default is False.
3016    alternative : {"two-sided", "greater", "less"}, optional
3017        The alternative hypothesis to be tested, see Notes. Default is
3018        "two-sided".
    mode : {"auto", "exact", "approx"}, optional
3020        Method to calculate the p-value, see Notes. Default is "auto".
3021
3022    Returns
3023    -------
3024    statistic : float
3025        If ``alternative`` is "two-sided", the sum of the ranks of the
3026        differences above or below zero, whichever is smaller.
3027        Otherwise the sum of the ranks of the differences above zero.
3028    pvalue : float
3029        The p-value for the test depending on ``alternative`` and ``mode``.
3030
3031    See Also
3032    --------
3033    kruskal, mannwhitneyu
3034
3035    Notes
3036    -----
    The test was introduced in [4]_. Given n independent samples
3038    (xi, yi) from a bivariate distribution (i.e. paired samples),
3039    it computes the differences di = xi - yi. One assumption of the test
3040    is that the differences are symmetric, see [2]_.
3041    The two-sided test has the null hypothesis that the median of the
3042    differences is zero against the alternative that it is different from
3043    zero. The one-sided test has the null hypothesis that the median is
3044    positive against the alternative that it is negative
    (``alternative == 'less'``), or vice versa (``alternative == 'greater'``).
3046
3047    To derive the p-value, the exact distribution (``mode == 'exact'``)
3048    can be used for sample sizes of up to 25. The default ``mode == 'auto'``
3049    uses the exact distribution if there are at most 25 observations and no
3050    ties, otherwise a normal approximation is used (``mode == 'approx'``).
3051
    The treatment of zero differences is controlled by the parameter
    `zero_method`. If ``zero_method == 'pratt'``, the normal approximation
    is adjusted as in [5]_. A typical rule for using the normal
    approximation is to require that n > 20 ([2]_, p. 383).
3055
3056    References
3057    ----------
3058    .. [1] https://en.wikipedia.org/wiki/Wilcoxon_signed-rank_test
3059    .. [2] Conover, W.J., Practical Nonparametric Statistics, 1971.
3060    .. [3] Pratt, J.W., Remarks on Zeros and Ties in the Wilcoxon Signed
3061       Rank Procedures, Journal of the American Statistical Association,
3062       Vol. 54, 1959, pp. 655-667. :doi:`10.1080/01621459.1959.10501526`
3063    .. [4] Wilcoxon, F., Individual Comparisons by Ranking Methods,
3064       Biometrics Bulletin, Vol. 1, 1945, pp. 80-83. :doi:`10.2307/3001968`
3065    .. [5] Cureton, E.E., The Normal Approximation to the Signed-Rank
3066       Sampling Distribution When Zero Differences are Present,
3067       Journal of the American Statistical Association, Vol. 62, 1967,
3068       pp. 1068-1069. :doi:`10.1080/01621459.1967.10500917`
3069
3070    Examples
3071    --------
    In [4]_, the differences in height between cross- and self-fertilized
    corn plants are given as follows:
3074
3075    >>> d = [6, 8, 14, 16, 23, 24, 28, 29, 41, -48, 49, 56, 60, -67, 75]
3076
    Cross-fertilized plants appear to be higher. To test the null
3078    hypothesis that there is no height difference, we can apply the
3079    two-sided test:
3080
3081    >>> from scipy.stats import wilcoxon
3082    >>> w, p = wilcoxon(d)
3083    >>> w, p
3084    (24.0, 0.041259765625)
3085
    Hence, we would reject the null hypothesis at a significance level of 5%,
3087    concluding that there is a difference in height between the groups.
3088    To confirm that the median of the differences can be assumed to be
3089    positive, we use:
3090
3091    >>> w, p = wilcoxon(d, alternative='greater')
3092    >>> w, p
3093    (96.0, 0.0206298828125)
3094
3095    This shows that the null hypothesis that the median is negative can be
    rejected at a significance level of 5% in favor of the alternative that
3097    the median is greater than zero. The p-values above are exact. Using the
3098    normal approximation gives very similar values:
3099
3100    >>> w, p = wilcoxon(d, mode='approx')
3101    >>> w, p
3102    (24.0, 0.04088813291185591)
3103
    Note that the statistic changed to 96 in the one-sided case (the sum
    of ranks of positive differences) whereas it is 24 in the two-sided
    case (the minimum of the sums of ranks above and below zero).
3107
3108    """
3109    if mode not in ["auto", "approx", "exact"]:
3110        raise ValueError("mode must be either 'auto', 'approx' or 'exact'")
3111
3112    if zero_method not in ["wilcox", "pratt", "zsplit"]:
3113        raise ValueError("Zero method must be either 'wilcox' "
3114                         "or 'pratt' or 'zsplit'")
3115
3116    if alternative not in ["two-sided", "less", "greater"]:
3117        raise ValueError("Alternative must be either 'two-sided', "
3118                         "'greater' or 'less'")
3119
3120    if y is None:
3121        d = asarray(x)
3122        if d.ndim > 1:
3123            raise ValueError('Sample x must be one-dimensional.')
3124    else:
3125        x, y = map(asarray, (x, y))
3126        if x.ndim > 1 or y.ndim > 1:
3127            raise ValueError('Samples x and y must be one-dimensional.')
3128        if len(x) != len(y):
3129            raise ValueError('The samples x and y must have the same length.')
3130        d = x - y
3131
3132    if mode == "auto":
3133        if len(d) <= 25:
3134            mode = "exact"
3135        else:
3136            mode = "approx"
3137
3138    n_zero = np.sum(d == 0)
3139    if n_zero > 0 and mode == "exact":
3140        mode = "approx"
        warnings.warn("Exact p-value calculation does not work if there are "
                      "zeros. Switching to normal approximation.")
3143
3144    if mode == "approx":
3145        if zero_method in ["wilcox", "pratt"]:
3146            if n_zero == len(d):
3147                raise ValueError("zero_method 'wilcox' and 'pratt' do not "
3148                                 "work if x - y is zero for all elements.")
3149        if zero_method == "wilcox":
3150            # Keep all non-zero differences
3151            d = compress(np.not_equal(d, 0), d)
3152
3153    count = len(d)
3154    if count < 10 and mode == "approx":
3155        warnings.warn("Sample size too small for normal approximation.")
3156
3157    r = stats.rankdata(abs(d))
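    # r_plus and r_minus are the sums of the ranks of the positive and
    # negative differences, respectively.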
3158    r_plus = np.sum((d > 0) * r)
3159    r_minus = np.sum((d < 0) * r)
3160
3161    if zero_method == "zsplit":
3162        r_zero = np.sum((d == 0) * r)
3163        r_plus += r_zero / 2.
3164        r_minus += r_zero / 2.
3165
3166    # return min for two-sided test, but r_plus for one-sided test
3167    # the literature is not consistent here
3168    # r_plus is more informative since r_plus + r_minus = count*(count+1)/2,
3169    # i.e. the sum of the ranks, so r_minus and the min can be inferred
    # (If zero_method='pratt', r_plus + r_minus = count*(count+1)/2 - r_zero.)
    # [3] uses r_plus for the one-sided test; min is kept for the two-sided
    # test for backwards compatibility.
3173    if alternative == "two-sided":
3174        T = min(r_plus, r_minus)
3175    else:
3176        T = r_plus
3177
3178    if mode == "approx":
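        # Under the null hypothesis, T has mean count*(count + 1)/4 and
        # variance count*(count + 1)*(2*count + 1)/24; `se` accumulates
        # 24 * Var(T) until the factor is divided out in the sqrt below.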
3179        mn = count * (count + 1.) * 0.25
3180        se = count * (count + 1.) * (2. * count + 1.)
3181
3182        if zero_method == "pratt":
3183            r = r[d != 0]
3184            # normal approximation needs to be adjusted, see Cureton (1967)
3185            mn -= n_zero * (n_zero + 1.) * 0.25
3186            se -= n_zero * (n_zero + 1.) * (2. * n_zero + 1.)
3187
3188        replist, repnum = find_repeats(r)
3189        if repnum.size != 0:
3190            # Correction for repeated elements.
3191            se -= 0.5 * (repnum * (repnum * repnum - 1)).sum()
3192
3193        se = sqrt(se / 24)
3194
3195        # apply continuity correction if applicable
3196        d = 0
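        # (the name `d` is reused for the continuity-correction term; the
        # array of differences is no longer needed at this point)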
3197        if correction:
3198            if alternative == "two-sided":
3199                d = 0.5 * np.sign(T - mn)
3200            elif alternative == "less":
3201                d = -0.5
3202            else:
3203                d = 0.5
3204
3205        # compute statistic and p-value using normal approximation
3206        z = (T - mn - d) / se
3207        if alternative == "two-sided":
3208            prob = 2. * distributions.norm.sf(abs(z))
3209        elif alternative == "greater":
3210            # large T = r_plus indicates x is greater than y; i.e.
3211            # accept alternative in that case and return small p-value (sf)
3212            prob = distributions.norm.sf(z)
3213        else:
3214            prob = distributions.norm.cdf(z)
3215    elif mode == "exact":
3216        # get frequencies cnt of the possible positive ranksums r_plus
3217        cnt = _get_wilcoxon_distr(count)
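        # cnt[k] is the number of sign assignments whose positive rank sum
        # equals k; all 2**count assignments are equally likely under H0.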
3218        # note: r_plus is int (ties not allowed), need int for slices below
3219        r_plus = int(r_plus)
3220        if alternative == "two-sided":
3221            if r_plus == (len(cnt) - 1) // 2:
3222                # r_plus is the center of the distribution.
3223                prob = 1.0
3224            else:
3225                p_less = np.sum(cnt[:r_plus + 1]) / 2**count
3226                p_greater = np.sum(cnt[r_plus:]) / 2**count
3227                prob = 2*min(p_greater, p_less)
3228        elif alternative == "greater":
3229            prob = np.sum(cnt[r_plus:]) / 2**count
3230        else:
3231            prob = np.sum(cnt[:r_plus + 1]) / 2**count
3232
3233    return WilcoxonResult(T, prob)
3234
3235
3236def median_test(*args, ties='below', correction=True, lambda_=1,
3237                nan_policy='propagate'):
3238    """Perform a Mood's median test.
3239
3240    Test that two or more samples come from populations with the same median.
3241
3242    Let ``n = len(args)`` be the number of samples.  The "grand median" of
3243    all the data is computed, and a contingency table is formed by
3244    classifying the values in each sample as being above or below the grand
3245    median.  The contingency table, along with `correction` and `lambda_`,
3246    are passed to `scipy.stats.chi2_contingency` to compute the test statistic
3247    and p-value.
3248
3249    Parameters
3250    ----------
3251    sample1, sample2, ... : array_like
3252        The set of samples.  There must be at least two samples.
3253        Each sample must be a one-dimensional sequence containing at least
3254        one value.  The samples are not required to have the same length.
3255    ties : str, optional
3256        Determines how values equal to the grand median are classified in
3257        the contingency table.  The string must be one of::
3258
3259            "below":
3260                Values equal to the grand median are counted as "below".
3261            "above":
3262                Values equal to the grand median are counted as "above".
3263            "ignore":
3264                Values equal to the grand median are not counted.
3265
3266        The default is "below".
3267    correction : bool, optional
3268        If True, *and* there are just two samples, apply Yates' correction
3269        for continuity when computing the test statistic associated with
3270        the contingency table.  Default is True.
3271    lambda_ : float or str, optional
3272        By default, the statistic computed in this test is Pearson's
3273        chi-squared statistic.  `lambda_` allows a statistic from the
3274        Cressie-Read power divergence family to be used instead.  See
3275        `power_divergence` for details.
3276        Default is 1 (Pearson's chi-squared statistic).
3277    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing nan. 'propagate' returns nan,
        'raise' raises an error, 'omit' performs the calculations ignoring
        nan values. Default is 'propagate'.
3281
3282    Returns
3283    -------
3284    stat : float
3285        The test statistic.  The statistic that is returned is determined by
3286        `lambda_`.  The default is Pearson's chi-squared statistic.
3287    p : float
3288        The p-value of the test.
3289    m : float
3290        The grand median.
3291    table : ndarray
3292        The contingency table.  The shape of the table is (2, n), where
3293        n is the number of samples.  The first row holds the counts of the
3294        values above the grand median, and the second row holds the counts
3295        of the values below the grand median.  The table allows further
3296        analysis with, for example, `scipy.stats.chi2_contingency`, or with
3297        `scipy.stats.fisher_exact` if there are two samples, without having
3298        to recompute the table.  If ``nan_policy`` is "propagate" and there
3299        are nans in the input, the return value for ``table`` is ``None``.
3300
3301    See Also
3302    --------
3303    kruskal : Compute the Kruskal-Wallis H-test for independent samples.
    mannwhitneyu : Compute the Mann-Whitney rank test on samples x and y.
3305
3306    Notes
3307    -----
3308    .. versionadded:: 0.15.0
3309
3310    References
3311    ----------
3312    .. [1] Mood, A. M., Introduction to the Theory of Statistics. McGraw-Hill
3313        (1950), pp. 394-399.
3314    .. [2] Zar, J. H., Biostatistical Analysis, 5th ed. Prentice Hall (2010).
3315        See Sections 8.12 and 10.15.
3316
3317    Examples
3318    --------
3319    A biologist runs an experiment in which there are three groups of plants.
3320    Group 1 has 16 plants, group 2 has 15 plants, and group 3 has 17 plants.
3321    Each plant produces a number of seeds.  The seed counts for each group
3322    are::
3323
3324        Group 1: 10 14 14 18 20 22 24 25 31 31 32 39 43 43 48 49
3325        Group 2: 28 30 31 33 34 35 36 40 44 55 57 61 91 92 99
3326        Group 3:  0  3  9 22 23 25 25 33 34 34 40 45 46 48 62 67 84
3327
3328    The following code applies Mood's median test to these samples.
3329
3330    >>> g1 = [10, 14, 14, 18, 20, 22, 24, 25, 31, 31, 32, 39, 43, 43, 48, 49]
3331    >>> g2 = [28, 30, 31, 33, 34, 35, 36, 40, 44, 55, 57, 61, 91, 92, 99]
3332    >>> g3 = [0, 3, 9, 22, 23, 25, 25, 33, 34, 34, 40, 45, 46, 48, 62, 67, 84]
3333    >>> from scipy.stats import median_test
3334    >>> stat, p, med, tbl = median_test(g1, g2, g3)
3335
3336    The median is
3337
3338    >>> med
3339    34.0
3340
3341    and the contingency table is
3342
3343    >>> tbl
3344    array([[ 5, 10,  7],
3345           [11,  5, 10]])
3346
3347    `p` is too large to conclude that the medians are not the same:
3348
3349    >>> p
3350    0.12609082774093244
3351
3352    The "G-test" can be performed by passing ``lambda_="log-likelihood"`` to
3353    `median_test`.
3354
3355    >>> g, p, med, tbl = median_test(g1, g2, g3, lambda_="log-likelihood")
3356    >>> p
3357    0.12224779737117837
3358
3359    The median occurs several times in the data, so we'll get a different
3360    result if, for example, ``ties="above"`` is used:
3361
3362    >>> stat, p, med, tbl = median_test(g1, g2, g3, ties="above")
3363    >>> p
3364    0.063873276069553273
3365
3366    >>> tbl
3367    array([[ 5, 11,  9],
3368           [11,  4,  8]])
3369
3370    This example demonstrates that if the data set is not large and there
3371    are values equal to the median, the p-value can be sensitive to the
3372    choice of `ties`.
3373
3374    """
3375    if len(args) < 2:
3376        raise ValueError('median_test requires two or more samples.')
3377
3378    ties_options = ['below', 'above', 'ignore']
3379    if ties not in ties_options:
3380        raise ValueError("invalid 'ties' option '%s'; 'ties' must be one "
3381                         "of: %s" % (ties, str(ties_options)[1:-1]))
3382
3383    data = [np.asarray(arg) for arg in args]
3384
3385    # Validate the sizes and shapes of the arguments.
3386    for k, d in enumerate(data):
3387        if d.size == 0:
3388            raise ValueError("Sample %d is empty. All samples must "
3389                             "contain at least one value." % (k + 1))
3390        if d.ndim != 1:
3391            raise ValueError("Sample %d has %d dimensions.  All "
3392                             "samples must be one-dimensional sequences." %
3393                             (k + 1, d.ndim))
3394
3395    cdata = np.concatenate(data)
3396    contains_nan, nan_policy = _contains_nan(cdata, nan_policy)
3397    if contains_nan and nan_policy == 'propagate':
3398        return np.nan, np.nan, np.nan, None
3399
3400    if contains_nan:
3401        grand_median = np.median(cdata[~np.isnan(cdata)])
3402    else:
3403        grand_median = np.median(cdata)
3404    # When the minimum version of numpy supported by scipy is 1.9.0,
3405    # the above if/else statement can be replaced by the single line:
3406    #     grand_median = np.nanmedian(cdata)
3407
3408    # Create the contingency table.
3409    table = np.zeros((2, len(data)), dtype=np.int64)
3410    for k, sample in enumerate(data):
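        # NaNs under 'propagate' or 'raise' were handled above; with
        # nan_policy 'omit' they are simply dropped from each sample here.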
3411        sample = sample[~np.isnan(sample)]
3412
3413        nabove = count_nonzero(sample > grand_median)
3414        nbelow = count_nonzero(sample < grand_median)
3415        nequal = sample.size - (nabove + nbelow)
3416        table[0, k] += nabove
3417        table[1, k] += nbelow
3418        if ties == "below":
3419            table[1, k] += nequal
3420        elif ties == "above":
3421            table[0, k] += nequal
3422
3423    # Check that no row or column of the table is all zero.
    # Such a table cannot be given to chi2_contingency, because it would have
3425    # a zero in the table of expected frequencies.
3426    rowsums = table.sum(axis=1)
3427    if rowsums[0] == 0:
3428        raise ValueError("All values are below the grand median (%r)." %
3429                         grand_median)
3430    if rowsums[1] == 0:
3431        raise ValueError("All values are above the grand median (%r)." %
3432                         grand_median)
3433    if ties == "ignore":
3434        # We already checked that each sample has at least one value, but it
3435        # is possible that all those values equal the grand median.  If `ties`
3436        # is "ignore", that would result in a column of zeros in `table`.  We
3437        # check for that case here.
3438        zero_cols = np.nonzero((table == 0).all(axis=0))[0]
3439        if len(zero_cols) > 0:
3440            msg = ("All values in sample %d are equal to the grand "
3441                   "median (%r), so they are ignored, resulting in an "
3442                   "empty sample." % (zero_cols[0] + 1, grand_median))
3443            raise ValueError(msg)
3444
3445    stat, p, dof, expected = chi2_contingency(table, lambda_=lambda_,
3446                                              correction=correction)
3447    return stat, p, grand_median, table
3448
3449
3450def _circfuncs_common(samples, high, low, nan_policy='propagate'):
3451    # Ensure samples are array-like and size is not zero
3452    samples = np.asarray(samples)
3453    if samples.size == 0:
3454        return np.nan, np.asarray(np.nan), np.asarray(np.nan), None
3455
3456    # Recast samples as radians that range between 0 and 2 pi and calculate
3457    # the sine and cosine
3458    sin_samp = sin((samples - low)*2.*pi / (high - low))
3459    cos_samp = cos((samples - low)*2.*pi / (high - low))
3460
3461    # Apply the NaN policy
3462    contains_nan, nan_policy = _contains_nan(samples, nan_policy)
3463    if contains_nan and nan_policy == 'omit':
3464        mask = np.isnan(samples)
3465        # Set the sines and cosines that are NaN to zero
3466        sin_samp[mask] = 0.0
3467        cos_samp[mask] = 0.0
3468    else:
3469        mask = None
3470
3471    return samples, sin_samp, cos_samp, mask
3472
3473
3474def circmean(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3475    """Compute the circular mean for samples in a range.
3476
3477    Parameters
3478    ----------
3479    samples : array_like
3480        Input array.
3481    high : float or int, optional
3482        High boundary for circular mean range.  Default is ``2*pi``.
3483    low : float or int, optional
3484        Low boundary for circular mean range.  Default is 0.
3485    axis : int, optional
3486        Axis along which means are computed.  The default is to compute
3487        the mean of the flattened array.
3488    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing nan. 'propagate' returns nan,
        'raise' raises an error, 'omit' performs the calculations ignoring
        nan values. Default is 'propagate'.
3492
3493    Returns
3494    -------
3495    circmean : float
3496        Circular mean.
3497
3498    Examples
3499    --------
    >>> import numpy as np
    >>> from scipy.stats import circmean
    >>> circmean([0.1, 2*np.pi+0.2, 6*np.pi+0.3])
3502    0.2
3503
3504    >>> from scipy.stats import circmean
    >>> circmean([0.2, 1.4, 2.6], high=1, low=0)
3506    0.4
3507
3508    """
3509    samples, sin_samp, cos_samp, nmask = _circfuncs_common(samples, high, low,
3510                                                           nan_policy=nan_policy)
3511    sin_sum = sin_samp.sum(axis=axis)
3512    cos_sum = cos_samp.sum(axis=axis)
3513    res = arctan2(sin_sum, cos_sum)
3514
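    # arctan2 returns angles in (-pi, pi]; shift negative results by 2*pi
    # so that `res` lies in [0, 2*pi) before rescaling to [low, high).
    # NaN entries are left untouched in the process.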
3515    mask_nan = ~np.isnan(res)
3516    if mask_nan.ndim > 0:
3517        mask = res[mask_nan] < 0
3518    else:
3519        mask = res < 0
3520
3521    if mask.ndim > 0:
3522        mask_nan[mask_nan] = mask
3523        res[mask_nan] += 2*pi
3524    elif mask:
3525        res += 2*pi
3526
3527    # Set output to NaN if no samples went into the mean
3528    if nmask is not None:
3529        if nmask.all():
3530            res = np.full(shape=res.shape, fill_value=np.nan)
3531        else:
            # Find out if any of the axes being averaged consists
            # entirely of NaN.  If one exists, set the result (res) to NaN.
3534            nshape = 0 if axis is None else axis
3535            smask = nmask.shape[nshape] == nmask.sum(axis=axis)
3536            if smask.any():
3537                res[smask] = np.nan
3538
3539    return res*(high - low)/2.0/pi + low
3540
3541
3542def circvar(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3543    """Compute the circular variance for samples assumed to be in a range.
3544
3545    Parameters
3546    ----------
3547    samples : array_like
3548        Input array.
3549    high : float or int, optional
3550        High boundary for circular variance range.  Default is ``2*pi``.
3551    low : float or int, optional
3552        Low boundary for circular variance range.  Default is 0.
3553    axis : int, optional
3554        Axis along which variances are computed.  The default is to compute
3555        the variance of the flattened array.
3556    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing nan. 'propagate' returns nan,
        'raise' raises an error, 'omit' performs the calculations ignoring
        nan values. Default is 'propagate'.
3560
3561    Returns
3562    -------
3563    circvar : float
3564        Circular variance.
3565
3566    Notes
3567    -----
3568    This uses a definition of circular variance that in the limit of small
3569    angles returns a number close to the 'linear' variance.
3570
3571    Examples
3572    --------
    >>> import numpy as np
    >>> from scipy.stats import circvar
    >>> circvar([0, 2*np.pi/3, 5*np.pi/3])
3575    2.19722457734
3576
3577    """
3578    samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3579                                                          nan_policy=nan_policy)
3580    if mask is None:
3581        sin_mean = sin_samp.mean(axis=axis)
3582        cos_mean = cos_samp.mean(axis=axis)
3583    else:
3584        nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3585        nsum[nsum == 0] = np.nan
3586        sin_mean = sin_samp.sum(axis=axis) / nsum
3587        cos_mean = cos_samp.sum(axis=axis) / nsum
3588    # hypot can go slightly above 1 due to rounding errors
3589    with np.errstate(invalid='ignore'):
3590        R = np.minimum(1, hypot(sin_mean, cos_mean))
3591
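    # R is the mean resultant length (0 <= R <= 1).  The circular variance
    # -2*log(R), computed in radians, is rescaled by the squared ratio of
    # the interval width to 2*pi.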
3592    return ((high - low)/2.0/pi)**2 * -2 * log(R)
3593
3594
3595def circstd(samples, high=2*pi, low=0, axis=None, nan_policy='propagate'):
3596    """
    Compute the circular standard deviation for samples assumed to be in the
    range [low, high].
3599
3600    Parameters
3601    ----------
3602    samples : array_like
3603        Input array.
3604    high : float or int, optional
3605        High boundary for circular standard deviation range.
3606        Default is ``2*pi``.
3607    low : float or int, optional
3608        Low boundary for circular standard deviation range.  Default is 0.
3609    axis : int, optional
3610        Axis along which standard deviations are computed.  The default is
3611        to compute the standard deviation of the flattened array.
3612    nan_policy : {'propagate', 'raise', 'omit'}, optional
        Defines how to handle input containing nan. 'propagate' returns nan,
        'raise' raises an error, 'omit' performs the calculations ignoring
        nan values. Default is 'propagate'.
3616
3617    Returns
3618    -------
3619    circstd : float
3620        Circular standard deviation.
3621
3622    Notes
3623    -----
3624    This uses a definition of circular standard deviation that in the limit of
3625    small angles returns a number close to the 'linear' standard deviation.
3626
3627    Examples
3628    --------
    >>> import numpy as np
    >>> from scipy.stats import circstd
    >>> circstd([0, 0.1*np.pi/2, 0.001*np.pi, 0.03*np.pi/2])
3631    0.063564063306
3632
3633    """
3634    samples, sin_samp, cos_samp, mask = _circfuncs_common(samples, high, low,
3635                                                          nan_policy=nan_policy)
3636    if mask is None:
3637        sin_mean = sin_samp.mean(axis=axis)
3638        cos_mean = cos_samp.mean(axis=axis)
3639    else:
3640        nsum = np.asarray(np.sum(~mask, axis=axis).astype(float))
3641        nsum[nsum == 0] = np.nan
3642        sin_mean = sin_samp.sum(axis=axis) / nsum
3643        cos_mean = cos_samp.sum(axis=axis) / nsum
3644    # hypot can go slightly above 1 due to rounding errors
3645    with np.errstate(invalid='ignore'):
3646        R = np.minimum(1, hypot(sin_mean, cos_mean))
3647
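    # Circular standard deviation sqrt(-2*log(R)) in radians, rescaled by
    # the ratio of the interval width to 2*pi.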
3648    return ((high - low)/2.0/pi) * sqrt(-2*log(R))
3649