1"""
2Basic statistics module.
3
4This module provides functions for calculating statistics of data, including
5averages, variance, and standard deviation.
6
7Calculating averages
8--------------------
9
10==================  ==================================================
11Function            Description
12==================  ==================================================
13mean                Arithmetic mean (average) of data.
14fmean               Fast, floating point arithmetic mean.
15geometric_mean      Geometric mean of data.
16harmonic_mean       Harmonic mean of data.
17median              Median (middle value) of data.
18median_low          Low median of data.
19median_high         High median of data.
20median_grouped      Median, or 50th percentile, of grouped data.
21mode                Mode (most common value) of data.
22multimode           List of modes (most common values of data).
23quantiles           Divide data into intervals with equal probability.
24==================  ==================================================
25
26Calculate the arithmetic mean ("the average") of data:
27
28>>> mean([-1.0, 2.5, 3.25, 5.75])
292.625
30
31
32Calculate the standard median of discrete data:
33
34>>> median([2, 3, 4, 5])
353.5
36
37
38Calculate the median, or 50th percentile, of data grouped into class intervals
39centred on the data values provided. E.g. if your data points are rounded to
40the nearest whole number:
41
42>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
432.8333333333...
44
45This should be interpreted in this way: you have two data points in the class
46interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
47the class interval 3.5-4.5. The median of these data points is 2.8333...
48
49
50Calculating variability or spread
51---------------------------------
52
53==================  =============================================
54Function            Description
55==================  =============================================
56pvariance           Population variance of data.
57variance            Sample variance of data.
58pstdev              Population standard deviation of data.
59stdev               Sample standard deviation of data.
60==================  =============================================
61
62Calculate the standard deviation of sample data:
63
64>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
654.38961843444...
66
67If you have previously calculated the mean, you can pass it as the optional
68second argument to the four "spread" functions to avoid recalculating it:
69
70>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
71>>> mu = mean(data)
72>>> pvariance(data, mu)
732.5
74
75
76Statistics for relations between two inputs
77-------------------------------------------
78
79==================  ====================================================
80Function            Description
81==================  ====================================================
82covariance          Sample covariance for two variables.
83correlation         Pearson's correlation coefficient for two variables.
84linear_regression   Intercept and slope for simple linear regression.
85==================  ====================================================
86
87Calculate covariance, Pearson's correlation, and simple linear regression
88for two inputs:
89
90>>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
91>>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
92>>> covariance(x, y)
930.75
94>>> correlation(x, y)  #doctest: +ELLIPSIS
950.31622776601...
>>> linear_regression(x, y)
LinearRegression(slope=0.1, intercept=1.5)
98
99
100Exceptions
101----------
102
103A single exception is defined: StatisticsError is a subclass of ValueError.
104
105"""
106
# Names exported by ``from statistics import *`` (kept alphabetical).
__all__ = [
    'NormalDist',
    'StatisticsError',
    'correlation',
    'covariance',
    'fmean',
    'geometric_mean',
    'harmonic_mean',
    'linear_regression',
    'mean',
    'median',
    'median_grouped',
    'median_high',
    'median_low',
    'mode',
    'multimode',
    'pstdev',
    'pvariance',
    'quantiles',
    'stdev',
    'variance',
]
129
130import math
131import numbers
132import random
133
134from fractions import Fraction
135from decimal import Decimal
136from itertools import groupby, repeat
137from bisect import bisect_left, bisect_right
138from math import hypot, sqrt, fabs, exp, erf, tau, log, fsum
139from operator import itemgetter
140from collections import Counter, namedtuple
141
142# === Exceptions ===
143
class StatisticsError(ValueError):
    """Raised when a statistics function receives invalid input,
    for example an empty dataset where at least one value is required."""
    pass
146
147
148# === Private utilities ===
149
def _sum(data):
    """_sum(data) -> (type, sum, count)

    Return a high-precision sum of the given numeric data as a fraction,
    together with the type to be converted to and the count of items.

    Examples
    --------

    >>> _sum([3, 2.25, 4.5, -0.5, 0.25])
    (<class 'float'>, Fraction(19, 2), 5)

    Sums are accumulated as exact {denominator: numerator} partials, so
    sources of round-off error are avoided:

    >>> _sum([1e50, 1, -1e50] * 1000)
    (<class 'float'>, Fraction(1000, 1), 3000)

    Fractions and Decimals are also supported:

    >>> from fractions import Fraction as F
    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)

    Mixed types are currently treated as an error, except that int is
    allowed.
    """
    count = 0
    partials = {}        # maps exact denominator -> accumulated numerator
    common_type = int    # running coerced type of everything seen so far
    for typ, group in groupby(data, type):
        # _coerce() raises TypeError for unsupported type mixtures.
        common_type = _coerce(common_type, typ)
        for num, den in map(_exact_ratio, group):
            count += 1
            partials[den] = partials.get(den, 0) + num
    if None in partials:
        # A NAN or INF was seen (stored under the None key).  It dominates
        # the sum, so every finite partial can be ignored.
        total = partials[None]
        assert not _isfinite(total)
    else:
        # Combine the exact partial sums with the builtin sum().
        total = sum(Fraction(num, den) for den, num in partials.items())
    return (common_type, total, count)
200
201
202def _isfinite(x):
203    try:
204        return x.is_finite()  # Likely a Decimal.
205    except AttributeError:
206        return math.isfinite(x)  # Coerces to float first.
207
208
209def _coerce(T, S):
210    """Coerce types T and S to a common type, or raise TypeError.
211
212    Coercion rules are currently an implementation detail. See the CoerceTest
213    test class in test_statistics for details.
214    """
215    # See http://bugs.python.org/issue24068.
216    assert T is not bool, "initial type T is bool"
217    # If the types are the same, no need to coerce anything. Put this
218    # first, so that the usual case (no coercion needed) happens as soon
219    # as possible.
220    if T is S:  return T
221    # Mixed int & other coerce to the other type.
222    if S is int or S is bool:  return T
223    if T is int:  return S
224    # If one is a (strict) subclass of the other, coerce to the subclass.
225    if issubclass(S, T):  return S
226    if issubclass(T, S):  return T
227    # Ints coerce to the other type.
228    if issubclass(T, int):  return S
229    if issubclass(S, int):  return T
230    # Mixed fraction & float coerces to float (or float subclass).
231    if issubclass(T, Fraction) and issubclass(S, float):
232        return S
233    if issubclass(T, float) and issubclass(S, Fraction):
234        return T
235    # Any other combination is disallowed.
236    msg = "don't know how to coerce %s and %s"
237    raise TypeError(msg % (T.__name__, S.__name__))
238
239
240def _exact_ratio(x):
241    """Return Real number x to exact (numerator, denominator) pair.
242
243    >>> _exact_ratio(0.25)
244    (1, 4)
245
246    x is expected to be an int, Fraction, Decimal or float.
247    """
248    try:
249        return x.as_integer_ratio()
250    except AttributeError:
251        pass
252    except (OverflowError, ValueError):
253        # float NAN or INF.
254        assert not _isfinite(x)
255        return (x, None)
256    try:
257        # x may be an Integral ABC.
258        return (x.numerator, x.denominator)
259    except AttributeError:
260        msg = f"can't convert type '{type(x).__name__}' to numerator/denominator"
261        raise TypeError(msg)
262
263
264def _convert(value, T):
265    """Convert value to given numeric type T."""
266    if type(value) is T:
267        # This covers the cases where T is Fraction, or where value is
268        # a NAN or INF (Decimal or float).
269        return value
270    if issubclass(T, int) and value.denominator != 1:
271        T = float
272    try:
273        # FIXME: what do we do if this overflows?
274        return T(value)
275    except TypeError:
276        if issubclass(T, Decimal):
277            return T(value.numerator) / T(value.denominator)
278        else:
279            raise
280
281
282def _find_lteq(a, x):
283    'Locate the leftmost value exactly equal to x'
284    i = bisect_left(a, x)
285    if i != len(a) and a[i] == x:
286        return i
287    raise ValueError
288
289
290def _find_rteq(a, l, x):
291    'Locate the rightmost value exactly equal to x'
292    i = bisect_right(a, x, lo=l)
293    if i != (len(a) + 1) and a[i - 1] == x:
294        return i - 1
295    raise ValueError
296
297
298def _fail_neg(values, errmsg='negative value'):
299    """Iterate over values, failing if any are less than zero."""
300    for x in values:
301        if x < 0:
302            raise StatisticsError(errmsg)
303        yield x
304
305
306# === Measures of central tendency (averages) ===
307
def mean(data):
    """Return the sample arithmetic mean (average) of *data*.

    The result keeps the type of the input where possible:

    >>> mean([1, 2, 3, 4, 4])
    2.8

    >>> from fractions import Fraction as F
    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
    Fraction(13, 21)

    >>> from decimal import Decimal as D
    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
    Decimal('0.5625')

    Raises StatisticsError when ``data`` is empty.
    """
    if iter(data) is data:
        # Materialize a one-shot iterator so it can be both counted and summed.
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('mean requires at least one data point')
    T, total, count = _sum(data)
    assert count == n
    return _convert(total / n, T)
332
333
def fmean(data):
    """Convert data to floats and compute the arithmetic mean.

    This runs faster than the mean() function and it always returns a float.

    >>> fmean([3.5, 4.0, 5.25])
    4.25

    Raises StatisticsError when the input dataset is empty.
    """
    n = 0
    try:
        n = len(data)
    except TypeError:
        # No __len__: count the elements while fsum() consumes them.
        def counted(iterable):
            nonlocal n
            for n, value in enumerate(iterable, start=1):
                yield value
        total = fsum(counted(data))
    else:
        total = fsum(data)
    if n == 0:
        raise StatisticsError('fmean requires at least one data point')
    return total / n
359
360
def geometric_mean(data):
    """Convert data to floats and compute the geometric mean.

    Raises a StatisticsError if the input dataset is empty,
    if it contains a zero, or if it contains a negative value.

    No special efforts are made to achieve exact results.
    (However, this may change in the future.)

    >>> round(geometric_mean([54, 24, 36]), 9)
    36.0
    """
    try:
        return exp(fmean(map(log, data)))
    except ValueError:
        # log() raises ValueError for zero/negative inputs, and an empty
        # dataset surfaces as StatisticsError (a ValueError subclass) from
        # fmean(); both are re-raised with a unified message.
        # BUGFIX: the two concatenated string literals previously produced
        # a doubled space ("dataset  containing").
        raise StatisticsError('geometric mean requires a non-empty dataset '
                              'containing positive numbers') from None
378
379
def harmonic_mean(data, weights=None):
    """Return the harmonic mean of data.

    The harmonic mean is the reciprocal of the arithmetic mean of the
    reciprocals of the data.  It can be used for averaging ratios or
    rates, for example speeds.

    Suppose a car travels 40 km/hr for 5 km and then speeds-up to
    60 km/hr for another 5 km. What is the average speed?

        >>> harmonic_mean([40, 60])
        48.0

    Suppose a car travels 40 km/hr for 5 km, and when traffic clears,
    speeds-up to 60 km/hr for the remaining 30 km of the journey. What
    is the average speed?

        >>> harmonic_mean([40, 60], weights=[5, 30])
        56.0

    If ``data`` is empty, or any element is less than zero,
    ``harmonic_mean`` will raise ``StatisticsError``.
    """
    if iter(data) is data:
        # Materialize a one-shot iterator so len() works below.
        data = list(data)
    errmsg = 'harmonic mean does not support negative values'
    n = len(data)
    if n < 1:
        raise StatisticsError('harmonic_mean requires at least one data point')
    elif n == 1 and weights is None:
        # Fast path: a single unweighted value is its own harmonic mean,
        # but it must still be validated.
        x = data[0]
        if isinstance(x, (numbers.Real, Decimal)):
            if x < 0:
                raise StatisticsError(errmsg)
            return x
        else:
            raise TypeError('unsupported type')
    if weights is None:
        # Unweighted case: every data point counts once.
        weights = repeat(1, n)
        sum_weights = n
    else:
        if iter(weights) is weights:
            # Materialize a one-shot weights iterator so len() works.
            weights = list(weights)
        if len(weights) != n:
            raise StatisticsError('Number of weights does not match data size')
        # _fail_neg() rejects negative weights; _sum() totals them exactly.
        _, sum_weights, _ = _sum(w for w in _fail_neg(weights, errmsg))
    try:
        data = _fail_neg(data, errmsg)
        # Zero-weight terms contribute nothing, so skip the division for them.
        T, total, count = _sum(w / x if w else 0 for w, x in zip(weights, data))
    except ZeroDivisionError:
        # A zero data point with a nonzero weight makes the harmonic mean zero.
        return 0
    if total <= 0:
        raise StatisticsError('Weighted sum must be positive')
    return _convert(sum_weights / total, T)
434
435# FIXME: investigate ways to calculate medians without sorting? Quickselect?
def median(data):
    """Return the median (middle value) of numeric data.

    When the number of data points is odd, return the middle data point.
    When the number of data points is even, the median is interpolated by
    taking the average of the two middle values:

    >>> median([1, 3, 5])
    3
    >>> median([1, 3, 5, 7])
    4.0

    Raises StatisticsError when *data* is empty.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    mid = n // 2
    if n % 2:
        return data[mid]
    return (data[mid - 1] + data[mid]) / 2
458
459
def median_low(data):
    """Return the low median of numeric data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the smaller of the two middle values is returned.

    >>> median_low([1, 3, 5])
    3
    >>> median_low([1, 3, 5, 7])
    3

    Raises StatisticsError when *data* is empty.
    """
    values = sorted(data)
    n = len(values)
    if n == 0:
        raise StatisticsError("no median for empty data")
    # For even n, step one slot left of the upper-middle element.
    if n % 2 == 0:
        return values[n // 2 - 1]
    return values[n // 2]
480
481
def median_high(data):
    """Return the high median of data.

    When the number of data points is odd, the middle value is returned.
    When it is even, the larger of the two middle values is returned.

    >>> median_high([1, 3, 5])
    3
    >>> median_high([1, 3, 5, 7])
    5

    Raises StatisticsError when *data* is empty.
    """
    values = sorted(data)
    if not values:
        raise StatisticsError("no median for empty data")
    # Index n//2 is the middle element for odd n and the upper-middle for even n.
    return values[len(values) // 2]
499
500
def median_grouped(data, interval=1):
    """Return the 50th percentile (median) of grouped continuous data.

    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
    3.7
    >>> median_grouped([52, 52, 53, 54])
    52.5

    This calculates the median as the 50th percentile, and should be
    used when your data is continuous and grouped. In the above example,
    the values 1, 2, 3, etc. actually represent the midpoint of classes
    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
    class 3.5-4.5, and interpolation is used to estimate it.

    Optional argument ``interval`` represents the class interval, and
    defaults to 1. Changing the class interval naturally will change the
    interpolated 50th percentile value:

    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
    3.25
    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
    3.5

    This function does not check whether the data points are at least
    ``interval`` apart.  Raises StatisticsError when *data* is empty.
    """
    data = sorted(data)
    n = len(data)
    if n == 0:
        raise StatisticsError("no median for empty data")
    elif n == 1:
        return data[0]
    # Find the value at the midpoint. Remember this corresponds to the
    # centre of the class interval.
    x = data[n // 2]
    for obj in (x, interval):
        # Strings support comparison/subtraction-like misuse; reject early.
        if isinstance(obj, (str, bytes)):
            raise TypeError('expected number but got %r' % obj)
    try:
        L = x - interval / 2  # The lower limit of the median interval.
    except TypeError:
        # Mixed type. For now we just coerce to float.
        L = float(x) - float(interval) / 2

    # Uses bisection search to search for x in data with log(n) time complexity
    # Find the position of leftmost occurrence of x in data
    l1 = _find_lteq(data, x)
    # Find the position of rightmost occurrence of x in data[l1...len(data)]
    # Assuming always l1 <= l2
    l2 = _find_rteq(data, l1, x)
    # cf: cumulative frequency below the median interval;
    # f:  frequency (count) within the median interval.
    cf = l1
    f = l2 - l1 + 1
    # Standard grouped-median interpolation formula.
    return L + interval * (n / 2 - cf) / f
554
555
def mode(data):
    """Return the most common data point from discrete or nominal data.

    ``mode`` assumes discrete data, and returns a single value. This is the
    standard treatment of the mode as commonly taught in schools:

        >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
        3

    This also works with nominal (non-numeric) data:

        >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
        'red'

    When several values share the highest frequency, the first one
    encountered in the data is returned:

        >>> mode(['red', 'red', 'green', 'blue', 'blue'])
        'red'

    Raises StatisticsError when *data* is empty.
    """
    most_common = Counter(iter(data)).most_common(1)
    if not most_common:
        raise StatisticsError('no mode for empty data')
    value, _ = most_common[0]
    return value
584
585
def multimode(data):
    """Return a list of the most frequently occurring values.

    Will return more than one result if there are multiple modes
    or an empty list if *data* is empty.

    >>> multimode('aabbbbbbbbcc')
    ['b']
    >>> multimode('aabbbbccddddeeffffgg')
    ['b', 'd', 'f']
    >>> multimode('')
    []
    """
    # most_common() sorts by descending count and is stable, so values
    # tied at the top count come out in first-seen order.
    counts = Counter(iter(data)).most_common()
    if not counts:
        return []
    top = counts[0][1]
    return [value for value, freq in counts if freq == top]
602
603
604# Notes on methods for computing quantiles
605# ----------------------------------------
606#
607# There is no one perfect way to compute quantiles.  Here we offer
608# two methods that serve common needs.  Most other packages
609# surveyed offered at least one or both of these two, making them
610# "standard" in the sense of "widely-adopted and reproducible".
611# They are also easy to explain, easy to compute manually, and have
612# straight-forward interpretations that aren't surprising.
613
614# The default method is known as "R6", "PERCENTILE.EXC", or "expected
615# value of rank order statistics". The alternative method is known as
616# "R7", "PERCENTILE.INC", or "mode of rank order statistics".
617
618# For sample data where there is a positive probability for values
619# beyond the range of the data, the R6 exclusive method is a
620# reasonable choice.  Consider a random sample of nine values from a
621# population with a uniform distribution from 0.0 to 1.0.  The
622# distribution of the third ranked sample point is described by
623# betavariate(alpha=3, beta=7) which has mode=0.250, median=0.286, and
624# mean=0.300.  Only the latter (which corresponds with R6) gives the
625# desired cut point with 30% of the population falling below that
626# value, making it comparable to a result from an inv_cdf() function.
627# The R6 exclusive method is also idempotent.
628
629# For describing population data where the end points are known to
630# be included in the data, the R7 inclusive method is a reasonable
631# choice.  Instead of the mean, it uses the mode of the beta
632# distribution for the interior points.  Per Hyndman & Fan, "One nice
633# property is that the vertices of Q7(p) divide the range into n - 1
634# intervals, and exactly 100p% of the intervals lie to the left of
635# Q7(p) and 100(1 - p)% of the intervals lie to the right of Q7(p)."
636
637# If needed, other methods could be added.  However, for now, the
638# position is that fewer options make for easier choices and that
639# external packages can be used for anything more advanced.
640
def quantiles(data, *, n=4, method='exclusive'):
    """Divide *data* into *n* continuous intervals with equal probability.

    Returns a list of (n - 1) cut points separating the intervals.

    Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
    Set *n* to 100 for percentiles which gives the 99 cuts points that
    separate *data* in to 100 equal sized groups.

    The *data* can be any iterable containing sample.
    The cut points are linearly interpolated between data points.

    If *method* is set to *inclusive*, *data* is treated as population
    data.  The minimum value is treated as the 0th percentile and the
    maximum value is treated as the 100th percentile.

    Raises StatisticsError when n < 1 or when *data* has fewer than two
    points, and ValueError for an unknown *method*.
    """
    if n < 1:
        raise StatisticsError('n must be at least 1')
    data = sorted(data)
    ld = len(data)
    if ld < 2:
        raise StatisticsError('must have at least two data points')
    if method == 'inclusive':
        # R7 / PERCENTILE.INC: data endpoints are the 0th and 100th
        # percentiles; interior cuts interpolate between neighbors.
        m = ld - 1
        result = []
        for i in range(1, n):
            j, delta = divmod(i * m, n)
            interpolated = (data[j] * (n - delta) + data[j + 1] * delta) / n
            result.append(interpolated)
        return result
    if method == 'exclusive':
        # R6 / PERCENTILE.EXC: allows for population values beyond the
        # observed range of the sample.
        m = ld + 1
        result = []
        for i in range(1, n):
            j = i * m // n                               # rescale i to m/n
            j = 1 if j < 1 else ld-1 if j > ld-1 else j  # clamp to 1 .. ld-1
            delta = i*m - j*n                            # exact integer math
            interpolated = (data[j - 1] * (n - delta) + data[j] * delta) / n
            result.append(interpolated)
        return result
    raise ValueError(f'Unknown method: {method!r}')
682
683
684# === Measures of spread ===
685
686# See http://mathworld.wolfram.com/Variance.html
687#     http://mathworld.wolfram.com/SampleVariance.html
688#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
689#
690# Under no circumstances use the so-called "computational formula for
691# variance", as that is only suitable for hand calculations with a small
692# amount of low-precision data. It has terrible numeric properties.
693#
694# See a comparison of three computational methods here:
695# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
696
def _ss(data, c=None):
    """Return sum of square deviations of sequence data.

    If ``c`` is None, the mean is calculated in one pass, and the deviations
    from the mean are calculated in a second pass. Otherwise, deviations are
    calculated from ``c`` as given. Use the second case with care, as it can
    lead to garbage results.

    Returns a ``(T, total)`` pair: ``T`` is the coerced data type and
    ``total`` the exact sum of squared deviations (a Fraction, unless a
    non-finite value was encountered).
    """
    if c is not None:
        # Deviations from a caller-supplied centre: one exact pass suffices.
        T, total, count = _sum((x-c)**2 for x in data)
        return (T, total)
    T, total, count = _sum(data)
    # The exact mean as a numerator/denominator pair (total is a Fraction).
    mean_n, mean_d = (total / count).as_integer_ratio()
    partials = Counter()
    for n, d in map(_exact_ratio, data):
        # Squared deviation (n/d - mean_n/mean_d)**2 kept as integer ratios,
        # accumulated by common denominator to stay exact.
        diff_n = n * mean_d - d * mean_n
        diff_d = d * mean_d
        partials[diff_d * diff_d] += diff_n * diff_n
    if None in partials:
        # The sum will be a NAN or INF. We can ignore all the finite
        # partials, and just look at this special one.
        total = partials[None]
        assert not _isfinite(total)
    else:
        total = sum(Fraction(n, d) for d, n in partials.items())
    return (T, total)
723
724
def variance(data, xbar=None):
    """Return the sample variance of data.

    data should be an iterable of Real-valued numbers, with at least two
    values.  The optional argument xbar, if given, should be the mean of
    the data; when omitted or None, the mean is computed automatically.

    Use this function when your data is a sample from a population.  To
    calculate the variance from the entire population, see ``pvariance``.

    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
    >>> variance(data)
    1.3720238095238095

    Passing a precomputed mean avoids recalculating it:

    >>> m = mean(data)
    >>> variance(data, m)
    1.3720238095238095

    Decimals and Fractions are supported and keep their type:

    >>> from decimal import Decimal as D
    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('31.01875')

    >>> from fractions import Fraction as F
    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
    Fraction(67, 108)

    Note: this function does not verify that ``xbar`` actually is the mean
    of ``data``; supplying an arbitrary value may yield invalid or
    impossible results.
    """
    if iter(data) is data:
        # A one-shot iterator must be materialized to get its length.
        data = list(data)
    n = len(data)
    if n < 2:
        raise StatisticsError('variance requires at least two data points')
    T, total_ss = _ss(data, xbar)
    # Divide by n - 1 (Bessel's correction) for an unbiased sample estimate.
    return _convert(total_ss / (n - 1), T)
770
771
def pvariance(data, mu=None):
    """Return the population variance of ``data``.

    data should be a sequence or iterable of Real-valued numbers, with at
    least one value.  The optional argument mu, if given, should be the
    mean of the data; when omitted or None, the mean is computed
    automatically.

    Use this function to calculate the variance from the entire population.
    To estimate the variance from a sample, the ``variance`` function is
    usually a better choice.

    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
    >>> pvariance(data)
    1.25

    Passing a precomputed mean avoids recalculating it:

    >>> mu = mean(data)
    >>> pvariance(data, mu)
    1.25

    Decimals and Fractions are supported and keep their type:

    >>> from decimal import Decimal as D
    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
    Decimal('24.815')

    >>> from fractions import Fraction as F
    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
    Fraction(13, 72)

    """
    if iter(data) is data:
        # A one-shot iterator must be materialized to get its length.
        data = list(data)
    n = len(data)
    if n < 1:
        raise StatisticsError('pvariance requires at least one data point')
    T, total_ss = _ss(data, mu)
    # Population variance divides by n (no Bessel's correction).
    return _convert(total_ss / n, T)
814
815
def stdev(data, xbar=None):
    """Return the square root of the sample variance.

    See ``variance`` for arguments and other details.

    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    1.0810874155219827
    """
    # NOTE: even with an exact sum of squared deviations, two rounding
    # steps remain: one inside variance()'s _convert() and one in the sqrt.
    var = variance(data, xbar)
    # Use the type's own sqrt (e.g. Decimal.sqrt) when available so the
    # result keeps the input type; otherwise fall back to math.sqrt().
    type_sqrt = getattr(var, 'sqrt', None)
    if type_sqrt is not None:
        return type_sqrt()
    return math.sqrt(var)
833
834
def pstdev(data, mu=None):
    """Return the square root of the population variance.

    See ``pvariance`` for arguments and other details.

    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
    0.986893273527251
    """
    # NOTE: even with an exact sum of squared deviations, two rounding
    # steps remain: one inside pvariance()'s _convert() and one in the sqrt.
    var = pvariance(data, mu)
    # Use the type's own sqrt (e.g. Decimal.sqrt) when available so the
    # result keeps the input type; otherwise fall back to math.sqrt().
    type_sqrt = getattr(var, 'sqrt', None)
    if type_sqrt is not None:
        return type_sqrt()
    return math.sqrt(var)
852
853
854# === Statistics for relations between two inputs ===
855
856# See https://en.wikipedia.org/wiki/Covariance
857#     https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
858#     https://en.wikipedia.org/wiki/Simple_linear_regression
859
860
def covariance(x, y, /):
    """Return the sample covariance of two inputs *x* and *y*.

    Covariance is a measure of the joint variability of two inputs.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [1, 2, 3, 1, 2, 3, 1, 2, 3]
    >>> covariance(x, y)
    0.75
    >>> z = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> covariance(x, z)
    -7.5

    Raises StatisticsError when the inputs differ in length or contain
    fewer than two points.
    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('covariance requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('covariance requires at least two data points')
    mean_x = fsum(x) / n
    mean_y = fsum(y) / n
    total = fsum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    # Sample covariance divides by n - 1 (Bessel's correction).
    return total / (n - 1)
887
888
def correlation(x, y, /):
    """Return Pearson's correlation coefficient for two inputs.

    The coefficient *r* takes values between -1 and +1.  It measures the
    strength and direction of the linear relationship: +1 means a very
    strong positive linear relationship, -1 a very strong negative one,
    and 0 no linear relationship at all.

    >>> x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    >>> y = [9, 8, 7, 6, 5, 4, 3, 2, 1]
    >>> correlation(x, x)
    1.0
    >>> correlation(x, y)
    -1.0

    Raises StatisticsError when the inputs differ in length, contain
    fewer than two points, or when either input is constant.
    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('correlation requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('correlation requires at least two data points')
    mean_x = fsum(x) / n
    mean_y = fsum(y) / n
    sxy = fsum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, y))
    sxx = fsum((xi - mean_x) ** 2.0 for xi in x)
    syy = fsum((yi - mean_y) ** 2.0 for yi in y)
    try:
        # r = sxy / sqrt(sxx * syy); a constant input makes sxx or syy zero.
        return sxy / sqrt(sxx * syy)
    except ZeroDivisionError:
        raise StatisticsError('at least one of the inputs is constant')
920
921
LinearRegression = namedtuple('LinearRegression', ('slope', 'intercept'))


def linear_regression(x, y, /, *, proportional=False):
    """Slope and intercept for simple linear regression.

    Return the slope and intercept of simple linear regression
    parameters estimated using ordinary least squares. Simple linear
    regression describes relationship between an independent variable
    *x* and a dependent variable *y* in terms of linear function:

        y = slope * x + intercept + noise

    where *slope* and *intercept* are the regression parameters that are
    estimated, and noise represents the variability of the data that was
    not explained by the linear regression (it is equal to the
    difference between predicted and actual values of the dependent
    variable).

    The parameters are returned as a named tuple.

    >>> x = [1, 2, 3, 4, 5]
    >>> noise = NormalDist().samples(5, seed=42)
    >>> y = [3 * x[i] + 2 + noise[i] for i in range(5)]
    >>> linear_regression(x, y)  #doctest: +ELLIPSIS
    LinearRegression(slope=3.09078914170..., intercept=1.75684970486...)

    If *proportional* is true, the independent variable *x* and the
    dependent variable *y* are assumed to be directly proportional.
    The data is fit to a line passing through the origin:

        y = slope * x + noise

    and the *intercept* is always 0.0.

    Raises StatisticsError if the inputs differ in length, have fewer
    than two points, or if *x* is constant (zero variance).
    """
    n = len(x)
    if len(y) != n:
        raise StatisticsError('linear regression requires that both inputs have same number of data points')
    if n < 2:
        raise StatisticsError('linear regression requires at least two data points')
    if proportional:
        # Fit through the origin: no centering on the means.
        sxy = fsum(xi * yi for xi, yi in zip(x, y))
        sxx = fsum(xi * xi for xi in x)
    else:
        # Means computed with fsum() to limit accumulated rounding error.
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar) ** 2.0 for xi in x)
    try:
        slope = sxy / sxx   # equivalent to:  covariance(x, y) / variance(x)
    except ZeroDivisionError:
        raise StatisticsError('x is constant')
    intercept = 0.0 if proportional else ybar - slope * xbar
    return LinearRegression(slope=slope, intercept=intercept)
965
966
967## Normal Distribution #####################################################
968
969
def _normal_dist_inv_cdf(p, mu, sigma):
    """Return x such that the normal CDF of N(mu, sigma) at x equals *p*.

    Pure Python fallback; a C implementation may replace it (see the
    import of _statistics below this function).
    """
    # There is no closed-form solution to the inverse CDF for the normal
    # distribution, so we use a rational approximation instead:
    # Wichura, M.J. (1988). "Algorithm AS241: The Percentage Points of the
    # Normal Distribution".  Applied Statistics. Blackwell Publishing. 37
    # (3): 477–484. doi:10.2307/2347330. JSTOR 2347330.
    q = p - 0.5
    if fabs(q) <= 0.425:
        # Central region |p - 0.5| <= 0.425: rational polynomial in
        # r = 0.180625 - q*q, scaled by q.  Coefficients are from AS241;
        # the "Hash sum" comments record the checksum of each table.
        r = 0.180625 - q * q
        # Hash sum: 55.88319_28806_14901_4439
        num = (((((((2.50908_09287_30122_6727e+3 * r +
                     3.34305_75583_58812_8105e+4) * r +
                     6.72657_70927_00870_0853e+4) * r +
                     4.59219_53931_54987_1457e+4) * r +
                     1.37316_93765_50946_1125e+4) * r +
                     1.97159_09503_06551_4427e+3) * r +
                     1.33141_66789_17843_7745e+2) * r +
                     3.38713_28727_96366_6080e+0) * q
        den = (((((((5.22649_52788_52854_5610e+3 * r +
                     2.87290_85735_72194_2674e+4) * r +
                     3.93078_95800_09271_0610e+4) * r +
                     2.12137_94301_58659_5867e+4) * r +
                     5.39419_60214_24751_1077e+3) * r +
                     6.87187_00749_20579_0830e+2) * r +
                     4.23133_30701_60091_1252e+1) * r +
                     1.0)
        x = num / den
        return mu + (x * sigma)
    # Tail regions: work with r = sqrt(-log(min(p, 1-p))).
    r = p if q <= 0.0 else 1.0 - p
    r = sqrt(-log(r))
    if r <= 5.0:
        # Near tail (p roughly in [1.4e-11, 0.075] or its mirror image).
        r = r - 1.6
        # Hash sum: 49.33206_50330_16102_89036
        num = (((((((7.74545_01427_83414_07640e-4 * r +
                     2.27238_44989_26918_45833e-2) * r +
                     2.41780_72517_74506_11770e-1) * r +
                     1.27045_82524_52368_38258e+0) * r +
                     3.64784_83247_63204_60504e+0) * r +
                     5.76949_72214_60691_40550e+0) * r +
                     4.63033_78461_56545_29590e+0) * r +
                     1.42343_71107_49683_57734e+0)
        den = (((((((1.05075_00716_44416_84324e-9 * r +
                     5.47593_80849_95344_94600e-4) * r +
                     1.51986_66563_61645_71966e-2) * r +
                     1.48103_97642_74800_74590e-1) * r +
                     6.89767_33498_51000_04550e-1) * r +
                     1.67638_48301_83803_84940e+0) * r +
                     2.05319_16266_37758_82187e+0) * r +
                     1.0)
    else:
        # Far tail: extreme p values.
        r = r - 5.0
        # Hash sum: 47.52583_31754_92896_71629
        num = (((((((2.01033_43992_92288_13265e-7 * r +
                     2.71155_55687_43487_57815e-5) * r +
                     1.24266_09473_88078_43860e-3) * r +
                     2.65321_89526_57612_30930e-2) * r +
                     2.96560_57182_85048_91230e-1) * r +
                     1.78482_65399_17291_33580e+0) * r +
                     5.46378_49111_64114_36990e+0) * r +
                     6.65790_46435_01103_77720e+0)
        den = (((((((2.04426_31033_89939_78564e-15 * r +
                     1.42151_17583_16445_88870e-7) * r +
                     1.84631_83175_10054_68180e-5) * r +
                     7.86869_13114_56132_59100e-4) * r +
                     1.48753_61290_85061_48525e-2) * r +
                     1.36929_88092_27358_05310e-1) * r +
                     5.99832_20655_58879_37690e-1) * r +
                     1.0)
    x = num / den
    # Mirror the tail result for the lower half (q < 0 means p < 0.5).
    if q < 0.0:
        x = -x
    return mu + (x * sigma)
1042
1043
# If available, use the C implementation from the _statistics accelerator
# module; it replaces the pure Python _normal_dist_inv_cdf defined above.
# Builds without the extension fall back to the Python version silently.
try:
    from _statistics import _normal_dist_inv_cdf
except ImportError:
    pass
1049
1050
class NormalDist:
    "Normal distribution of a random variable"
    # https://en.wikipedia.org/wiki/Normal_distribution
    # https://en.wikipedia.org/wiki/Variance#Properties

    # The dict form of __slots__ both limits per-instance memory and
    # documents each slot; the values serve as attribute descriptions.
    __slots__ = {
        '_mu': 'Arithmetic mean of a normal distribution',
        '_sigma': 'Standard deviation of a normal distribution',
    }
1060
    def __init__(self, mu=0.0, sigma=1.0):
        "NormalDist where mu is the mean and sigma is the standard deviation."
        # Reject negative sigma up front.  Note: a NaN sigma passes this
        # comparison (NaN < 0.0 is False) and is stored as-is.
        if sigma < 0.0:
            raise StatisticsError('sigma must be non-negative')
        # Store as floats so downstream arithmetic is uniform.
        self._mu = float(mu)
        self._sigma = float(sigma)
1067
1068    @classmethod
1069    def from_samples(cls, data):
1070        "Make a normal distribution instance from sample data."
1071        if not isinstance(data, (list, tuple)):
1072            data = list(data)
1073        xbar = fmean(data)
1074        return cls(xbar, stdev(data, xbar))
1075
1076    def samples(self, n, *, seed=None):
1077        "Generate *n* samples for a given mean and standard deviation."
1078        gauss = random.gauss if seed is None else random.Random(seed).gauss
1079        mu, sigma = self._mu, self._sigma
1080        return [gauss(mu, sigma) for i in range(n)]
1081
1082    def pdf(self, x):
1083        "Probability density function.  P(x <= X < x+dx) / dx"
1084        variance = self._sigma ** 2.0
1085        if not variance:
1086            raise StatisticsError('pdf() not defined when sigma is zero')
1087        return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
1088
1089    def cdf(self, x):
1090        "Cumulative distribution function.  P(X <= x)"
1091        if not self._sigma:
1092            raise StatisticsError('cdf() not defined when sigma is zero')
1093        return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
1094
1095    def inv_cdf(self, p):
1096        """Inverse cumulative distribution function.  x : P(X <= x) = p
1097
1098        Finds the value of the random variable such that the probability of
1099        the variable being less than or equal to that value equals the given
1100        probability.
1101
1102        This function is also called the percent point function or quantile
1103        function.
1104        """
1105        if p <= 0.0 or p >= 1.0:
1106            raise StatisticsError('p must be in the range 0.0 < p < 1.0')
1107        if self._sigma <= 0.0:
1108            raise StatisticsError('cdf() not defined when sigma at or below zero')
1109        return _normal_dist_inv_cdf(p, self._mu, self._sigma)
1110
1111    def quantiles(self, n=4):
1112        """Divide into *n* continuous intervals with equal probability.
1113
1114        Returns a list of (n - 1) cut points separating the intervals.
1115
1116        Set *n* to 4 for quartiles (the default).  Set *n* to 10 for deciles.
1117        Set *n* to 100 for percentiles which gives the 99 cuts points that
1118        separate the normal distribution in to 100 equal sized groups.
1119        """
1120        return [self.inv_cdf(i / n) for i in range(1, n)]
1121
1122    def overlap(self, other):
1123        """Compute the overlapping coefficient (OVL) between two normal distributions.
1124
1125        Measures the agreement between two normal probability distributions.
1126        Returns a value between 0.0 and 1.0 giving the overlapping area in
1127        the two underlying probability density functions.
1128
1129            >>> N1 = NormalDist(2.4, 1.6)
1130            >>> N2 = NormalDist(3.2, 2.0)
1131            >>> N1.overlap(N2)
1132            0.8035050657330205
1133        """
1134        # See: "The overlapping coefficient as a measure of agreement between
1135        # probability distributions and point estimation of the overlap of two
1136        # normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
1137        # http://dx.doi.org/10.1080/03610928908830127
1138        if not isinstance(other, NormalDist):
1139            raise TypeError('Expected another NormalDist instance')
1140        X, Y = self, other
1141        if (Y._sigma, Y._mu) < (X._sigma, X._mu):  # sort to assure commutativity
1142            X, Y = Y, X
1143        X_var, Y_var = X.variance, Y.variance
1144        if not X_var or not Y_var:
1145            raise StatisticsError('overlap() not defined when sigma is zero')
1146        dv = Y_var - X_var
1147        dm = fabs(Y._mu - X._mu)
1148        if not dv:
1149            return 1.0 - erf(dm / (2.0 * X._sigma * sqrt(2.0)))
1150        a = X._mu * Y_var - Y._mu * X_var
1151        b = X._sigma * Y._sigma * sqrt(dm**2.0 + dv * log(Y_var / X_var))
1152        x1 = (a + b) / dv
1153        x2 = (a - b) / dv
1154        return 1.0 - (fabs(Y.cdf(x1) - X.cdf(x1)) + fabs(Y.cdf(x2) - X.cdf(x2)))
1155
1156    def zscore(self, x):
1157        """Compute the Standard Score.  (x - mean) / stdev
1158
1159        Describes *x* in terms of the number of standard deviations
1160        above or below the mean of the normal distribution.
1161        """
1162        # https://www.statisticshowto.com/probability-and-statistics/z-score/
1163        if not self._sigma:
1164            raise StatisticsError('zscore() not defined when sigma is zero')
1165        return (x - self._mu) / self._sigma
1166
    @property
    def mean(self):
        "Arithmetic mean of the normal distribution."
        # For a normal distribution the mean equals mu.
        return self._mu
1171
    @property
    def median(self):
        "Return the median of the normal distribution."
        # A normal distribution is symmetric, so the median equals mu.
        return self._mu
1176
    @property
    def mode(self):
        """Return the mode of the normal distribution

        The mode is the value x at which the probability density
        function (pdf) takes its maximum value.
        """
        # The density peaks at the mean, so the mode equals mu.
        return self._mu
1185
    @property
    def stdev(self):
        "Standard deviation of the normal distribution."
        return self._sigma
1190
    @property
    def variance(self):
        "Square of the standard deviation."
        return self._sigma ** 2.0
1195
1196    def __add__(x1, x2):
1197        """Add a constant or another NormalDist instance.
1198
1199        If *other* is a constant, translate mu by the constant,
1200        leaving sigma unchanged.
1201
1202        If *other* is a NormalDist, add both the means and the variances.
1203        Mathematically, this works only if the two distributions are
1204        independent or if they are jointly normally distributed.
1205        """
1206        if isinstance(x2, NormalDist):
1207            return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
1208        return NormalDist(x1._mu + x2, x1._sigma)
1209
1210    def __sub__(x1, x2):
1211        """Subtract a constant or another NormalDist instance.
1212
1213        If *other* is a constant, translate by the constant mu,
1214        leaving sigma unchanged.
1215
1216        If *other* is a NormalDist, subtract the means and add the variances.
1217        Mathematically, this works only if the two distributions are
1218        independent or if they are jointly normally distributed.
1219        """
1220        if isinstance(x2, NormalDist):
1221            return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
1222        return NormalDist(x1._mu - x2, x1._sigma)
1223
1224    def __mul__(x1, x2):
1225        """Multiply both mu and sigma by a constant.
1226
1227        Used for rescaling, perhaps to change measurement units.
1228        Sigma is scaled with the absolute value of the constant.
1229        """
1230        return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
1231
1232    def __truediv__(x1, x2):
1233        """Divide both mu and sigma by a constant.
1234
1235        Used for rescaling, perhaps to change measurement units.
1236        Sigma is scaled with the absolute value of the constant.
1237        """
1238        return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
1239
    def __pos__(x1):
        "Return a copy of the instance."
        # Unary plus builds a fresh NormalDist rather than returning self.
        return NormalDist(x1._mu, x1._sigma)
1243
    def __neg__(x1):
        "Negates mu while keeping sigma the same."
        return NormalDist(-x1._mu, x1._sigma)
1247
1248    __radd__ = __add__
1249
1250    def __rsub__(x1, x2):
1251        "Subtract a NormalDist from a constant or another NormalDist."
1252        return -(x1 - x2)
1253
1254    __rmul__ = __mul__
1255
1256    def __eq__(x1, x2):
1257        "Two NormalDist objects are equal if their mu and sigma are both equal."
1258        if not isinstance(x2, NormalDist):
1259            return NotImplemented
1260        return x1._mu == x2._mu and x1._sigma == x2._sigma
1261
1262    def __hash__(self):
1263        "NormalDist objects hash equal if their mu and sigma are both equal."
1264        return hash((self._mu, self._sigma))
1265
1266    def __repr__(self):
1267        return f'{type(self).__name__}(mu={self._mu!r}, sigma={self._sigma!r})'
1268