1# -*- coding: utf-8 -*-
2"""
3Created on Tue Oct  6 12:42:11 2020
4
5Author: Josef Perktold
6License: BSD-3
7
8"""
9
10import numpy as np
11from scipy import stats
12
13from statsmodels.stats.base import HolderTuple
14from statsmodels.stats.effect_size import _noncentrality_chisquare
15
16
def test_chisquare_binning(counts, expected, sort_var=None, bins=10,
                           df=None, ordered=False, sort_method="quicksort",
                           alpha_nc=0.05):
    """chisquare gof test with binning of data, Hosmer-Lemeshow type

    ``observed`` and ``expected`` are observation specific and should have
    observations in rows and choices in columns

    Parameters
    ----------
    counts : array_like
        Observed frequency, i.e. counts for all choices
    expected : array_like
        Expected counts or probability. If expected are counts, then they
        need to sum to the same total count as the sum of observed.
        If those sums are unequal and all expected values are smaller or equal
        to 1, then they are interpreted as probabilities and will be rescaled
        to match counts.
    sort_var : array_like
        1-dimensional array for binning. Groups will be formed according to
        quantiles of the sorted array ``sort_var``, so that group sizes have
        equal or approximately equal sizes.
    bins : int
        Number of quantile groups that the observations are split into.
    df : int or None
        Degrees of freedom for the chisquare distribution. If None, then
        df is computed from the number of groups and choices, see Notes.
    ordered : bool
        If true, then the degrees of freedom of the ordinal version of the
        Hosmer-Lemeshow test are used. Only relevant if ``df`` is None.
    sort_method : str
        Sort algorithm used by ``np.argsort`` when binning by ``sort_var``.
    alpha_nc : float
        Significance level for the confidence interval of the noncentrality
        parameter.

    Returns
    -------
    HolderTuple instance
        This instance contains the results of the chisquare test and some
        information about the data

        - statistic : chisquare statistic of the goodness-of-fit test
        - pvalue : pvalue of the chisquare test
        - df : degrees of freedom of the test

    Notes
    -----
    Degrees of freedom for Hosmer-Lemeshow tests are given by

    g groups, c choices

    - binary: `df = (g - 2)` for insample,
         Stata uses `df = g` for outsample
    - multinomial: `df = (g−2) *(c−1)`, reduces to (g-2) for binary c=2,
         (Fagerland, Hosmer, Bofin SIM 2008)
    - ordinal: `df = (g - 2) * (c - 1) + (c - 2)`, reduces to (g-2) for c=2,
         (Hosmer, ... ?)

    Note: If there are ties in the ``sort_var`` array, then the split of
    observations into groups will depend on the sort algorithm.
    """

    observed = np.asarray(counts)
    expected = np.asarray(expected)
    # sum the converted arrays so that plain lists/tuples are accepted;
    # ``counts.sum()`` would raise AttributeError for non-ndarray input
    n_observed = observed.sum()
    n_expected = expected.sum()
    if not np.allclose(n_observed, n_expected, atol=1e-13):
        if np.max(expected) < 1 + 1e-13:
            # expected seems to be probability, warn and rescale
            import warnings
            warnings.warn("sum of expected and of observed differ, "
                          "rescaling ``expected``")
            expected = expected / n_expected * n_observed
        else:
            # expected doesn't look like fractions or probabilities
            raise ValueError("total counts of expected and observed differ")

    # form ``bins`` groups of (approximately) equal size, ordered by sort_var
    if sort_var is not None:
        argsort = np.argsort(sort_var, kind=sort_method)
    else:
        argsort = np.arange(observed.shape[0])
    indices = np.array_split(argsort, bins, axis=0)
    # aggregate observed and expected counts within each group
    freqs = np.array([observed[idx].sum(0) for idx in indices])
    probs = np.array([expected[idx].sum(0) for idx in indices])

    # chisquare test
    resid_pearson = (freqs - probs) / np.sqrt(probs)
    chi2_stat_groups = ((freqs - probs)**2 / probs).sum(1)
    chi2_stat = chi2_stat_groups.sum()
    if df is None:
        g, c = freqs.shape
        if ordered is True:
            df = (g - 2) * (c - 1) + (c - 2)
        else:
            df = (g - 2) * (c - 1)
    pvalue = stats.chi2.sf(chi2_stat, df)
    noncentrality = _noncentrality_chisquare(chi2_stat, df, alpha=alpha_nc)

    res = HolderTuple(statistic=chi2_stat,
                      pvalue=pvalue,
                      df=df,
                      freqs=freqs,
                      probs=probs,
                      noncentrality=noncentrality,
                      resid_pearson=resid_pearson,
                      chi2_stat_groups=chi2_stat_groups,
                      indices=indices
                      )
    return res
120
121
def prob_larger_ordinal_choice(prob):
    """probability that observed category is larger than distribution prob

    This is a helper function for Ordinal models, where endog is a 1-dim
    categorical variable and predicted probabilities are 2-dimensional with
    observations in rows and choices in columns.

    Parameters
    ----------
    prob : array_like
        Expected probabilities for ordinal choices, e.g. from prediction of
        an ordinal model with observations in rows and choices in columns.
        Choices are along the last axis; any number of leading axes is
        supported.

    Returns
    -------
    cdf_mid : ndarray
        mid cdf, i.e ``P(x < y) + 0.5 P(x=y)``
    r : ndarray
        Probability residual ``P(x > y) - P(x < y)`` for all possible choices.
        Computed as ``r = cdf_mid * 2 - 1``

    References
    ----------
    .. [2] Li, Chun, and Bryan E. Shepherd. 2012. “A New Residual for Ordinal
       Outcomes.” Biometrika 99 (2): 473–80.

    See Also
    --------
    `statsmodels.stats.nonparametric.rank_compare_2ordinal`

    """
    # similar to `nonparametric rank_compare_2ordinal`

    prob = np.asarray(prob)
    cdf = prob.cumsum(-1)
    # prepend a zero along the choice (last) axis; shape-based construction
    # works for any ndim, whereas branching on ndim==1/2 left other ranks
    # unhandled (UnboundLocalError)
    cdf_ = np.concatenate((np.zeros(cdf.shape[:-1] + (1,)), cdf), axis=-1)
    # average of consecutive cdf values is the mid-rank cdf
    cdf_mid = (cdf_[..., 1:] + cdf_[..., :-1]) / 2
    r = cdf_mid * 2 - 1
    return cdf_mid, r
165
166
def prob_larger_2ordinal(probs1, probs2):
    """Stochastically large probability for two ordinal distributions

    Computes Pr(x1 > x2) + 0.5 * Pr(x1 = x2) for two ordered multinomial
    (ordinal) distributed random variables x1 and x2.

    This is vectorized with choices along last axis.
    Broadcasting if freq2 is 1-dim also seems to work correctly.

    Returns
    -------
    prob1 : float
        Probability that random draw from distribution 1 is larger than a
        random draw from distribution 2. Pr(x1 > x2) + 0.5 * Pr(x1 = x2)
    prob2 : float
        prob2 = 1 - prob1 = Pr(x1 < x2) + 0.5 * Pr(x1 = x2)
    """
    p1 = np.asarray(probs1)
    p2 = np.asarray(probs2)

    def _cdf_mid(p):
        # prepend a zero along the choice axis so the cumulative sum starts
        # at 0, then average consecutive cdf values to get the mid-rank cdf
        padded = np.concatenate((np.zeros(p.shape[:-1] + (1,)), p), axis=-1)
        cdf = padded.cumsum(axis=-1)
        return (cdf[..., 1:] + cdf[..., :-1]) / 2

    # expected mid-rank cdf of the other distribution at each choice
    prob1 = (_cdf_mid(p2) * p1).sum(-1)
    prob2 = (_cdf_mid(p1) * p2).sum(-1)
    return prob1, prob2
217
218
def cov_multinomial(probs):
    """covariance matrix of multinomial distribution

    This is vectorized with choices along last axis.

    cov = diag(probs) - outer(probs, probs)

    """
    n_choices = probs.shape[-1]
    # negated outer product for every leading index, via broadcasting
    cov = -(probs[..., :, None] * probs[..., None, :])
    # add probs to the diagonal of each trailing (k, k) matrix
    rows, cols = np.diag_indices(n_choices, 2)
    cov[..., rows, cols] += probs
    return cov
234
235
def var_multinomial(probs):
    """variance of multinomial distribution

    var = probs * (1 - probs)

    """
    # elementwise Bernoulli-type variance for each choice probability
    return probs * (1.0 - probs)
244