# -*- coding: utf-8 -*-
"""
Created on Tue Oct  6 12:42:11 2020

Author: Josef Perktold
License: BSD-3

"""

import warnings

import numpy as np
from scipy import stats

from statsmodels.stats.base import HolderTuple
from statsmodels.stats.effect_size import _noncentrality_chisquare


def test_chisquare_binning(counts, expected, sort_var=None, bins=10,
                           df=None, ordered=False, sort_method="quicksort",
                           alpha_nc=0.05):
    """chisquare gof test with binning of data, Hosmer-Lemeshow type

    ``observed`` and ``expected`` are observation specific and should have
    observations in rows and choices in columns

    Parameters
    ----------
    counts : array_like
        Observed frequency, i.e. counts for all choices
    expected : array_like
        Expected counts or probability. If expected are counts, then they
        need to sum to the same total count as the sum of observed.
        If those sums are unequal and all expected values are smaller or equal
        to 1, then they are interpreted as probabilities and will be rescaled
        to match counts.
    sort_var : array_like
        1-dimensional array for binning. Groups will be formed according to
        quantiles of the sorted array ``sort_var``, so that group sizes have
        equal or approximately equal sizes.
    bins : int
        Number of groups (bins) formed from the sorted observations.
    df : int or None
        Degrees of freedom of the chisquare distribution. If None, then the
        Hosmer-Lemeshow defaults are used, see Notes.
    ordered : bool
        If true, then the default degrees of freedom correspond to an
        ordinal model, see Notes.
    sort_method : str
        Sort algorithm used by ``np.argsort`` when ``sort_var`` is given.
        With ties in ``sort_var`` the group split depends on this choice.
    alpha_nc : float
        Significance level ``alpha`` for the confidence interval of the
        noncentrality parameter.

    Returns
    -------
    HolderTuple instance
        This instance contains the results of the chisquare test and some
        information about the data

        - statistic : chisquare statistic of the goodness-of-fit test
        - pvalue : pvalue of the chisquare test
        - df : degrees of freedom of the test

    Notes
    -----
    Degrees of freedom for Hosmer-Lemeshow tests are given by

    g groups, c choices

    - binary: `df = (g - 2)` for insample,
         Stata uses `df = g` for outsample
    - multinomial: `df = (g - 2) * (c - 1)`, reduces to (g-2) for binary c=2,
         (Fagerland, Hosmer, Bofin SIM 2008)
    - ordinal: `df = (g - 2) * (c - 1) + (c - 2)`, reduces to (g-2) for c=2,
         (Hosmer, ... ?)

    Note: If there are ties in the ``sort_var`` array, then the split of
    observations into groups will depend on the sort algorithm.
    """

    observed = np.asarray(counts)
    expected = np.asarray(expected)
    # sum the converted arrays so that list input works as documented
    n_observed = observed.sum()
    n_expected = expected.sum()
    if not np.allclose(n_observed, n_expected, atol=1e-13):
        if np.max(expected) < 1 + 1e-13:
            # expected seems to be probability, warn and rescale
            warnings.warn("sum of expected and of observed differ, "
                          "rescaling ``expected``")
            expected = expected / n_expected * n_observed
        else:
            # expected doesn't look like fractions or probabilities
            raise ValueError("total counts of expected and observed differ")

    # form groups of (approximately) equal size, in ``sort_var`` order if
    # a sort variable is given, otherwise in the original observation order
    if sort_var is not None:
        argsort = np.argsort(sort_var, kind=sort_method)
    else:
        argsort = np.arange(observed.shape[0])
    indices = np.array_split(argsort, bins, axis=0)
    # per-group observed and expected counts for each choice
    freqs = np.array([observed[idx].sum(0) for idx in indices])
    probs = np.array([expected[idx].sum(0) for idx in indices])

    # chisquare test
    resid_pearson = (freqs - probs) / np.sqrt(probs)
    chi2_stat_groups = ((freqs - probs)**2 / probs).sum(1)
    chi2_stat = chi2_stat_groups.sum()
    if df is None:
        g, c = freqs.shape
        if ordered is True:
            df = (g - 2) * (c - 1) + (c - 2)
        else:
            df = (g - 2) * (c - 1)

    pvalue = stats.chi2.sf(chi2_stat, df)
    noncentrality = _noncentrality_chisquare(chi2_stat, df, alpha=alpha_nc)

    res = HolderTuple(statistic=chi2_stat,
                      pvalue=pvalue,
                      df=df,
                      freqs=freqs,
                      probs=probs,
                      noncentrality=noncentrality,
                      resid_pearson=resid_pearson,
                      chi2_stat_groups=chi2_stat_groups,
                      indices=indices
                      )
    return res


def prob_larger_ordinal_choice(prob):
    """probability that observed category is larger than distribution prob

    This is a helper function for Ordinal models, where endog is a 1-dim
    categorical variable and predicted probabilities are 2-dimensional with
    observations in rows and choices in columns.

    Parameters
    ----------
    prob : array_like
        Expected probabilities for ordinal choices, e.g. from prediction of
        an ordinal model with observations in rows and choices in columns.

    Returns
    -------
    cdf_mid : ndarray
        mid cdf, i.e ``P(x < y) + 0.5 P(x=y)``
    r : ndarray
        Probability residual ``P(x > y) - P(x < y)`` for all possible choices.
        Computed as ``r = cdf_mid * 2 - 1``

    References
    ----------
    .. [2] Li, Chun, and Bryan E. Shepherd. 2012. "A New Residual for Ordinal
       Outcomes." Biometrika 99 (2): 473-80.

    See Also
    --------
    `statsmodels.stats.nonparametric.rank_compare_2ordinal`

    """
    # similar to `nonparametric rank_compare_2ordinal`

    prob = np.asarray(prob)
    cdf = prob.cumsum(-1)
    # prepend a zero along the choice (last) axis; this works for any ndim
    # and avoids an undefined ``cdf_`` for inputs with more than 2 dims
    cdf_ = np.concatenate((np.zeros(cdf.shape[:-1] + (1,)), cdf), axis=-1)
    # mid cdf of each choice: average of cdf at and just below the choice
    cdf_mid = (cdf_[..., 1:] + cdf_[..., :-1]) / 2
    r = cdf_mid * 2 - 1
    return cdf_mid, r


def prob_larger_2ordinal(probs1, probs2):
    """Stochastically large probability for two ordinal distributions

    Computes Pr(x1 > x2) + 0.5 * Pr(x1 = x2) for two ordered multinomial
    (ordinal) distributed random variables x1 and x2.

    This is vectorized with choices along last axis.
    Broadcasting if freq2 is 1-dim also seems to work correctly.

    Parameters
    ----------
    probs1 : array_like
        Choice probabilities of distribution 1, choices in last axis.
    probs2 : array_like
        Choice probabilities of distribution 2, choices in last axis.

    Returns
    -------
    prob1 : float
        Probability that random draw from distribution 1 is larger than a
        random draw from distribution 2. Pr(x1 > x2) + 0.5 * Pr(x1 = x2)
    prob2 : float
        prob2 = 1 - prob1 = Pr(x1 < x2) + 0.5 * Pr(x1 = x2)
    """
    freq1 = np.asarray(probs1)
    freq2 = np.asarray(probs2)
    # add zero at beginning of choices for cdf computation
    freq1_ = np.concatenate((np.zeros(freq1.shape[:-1] + (1,)), freq1),
                            axis=-1)
    freq2_ = np.concatenate((np.zeros(freq2.shape[:-1] + (1,)), freq2),
                            axis=-1)

    cdf1 = freq1_.cumsum(axis=-1)
    cdf2 = freq2_.cumsum(axis=-1)

    # mid rank cdf
    cdfm1 = (cdf1[..., 1:] + cdf1[..., :-1]) / 2
    cdfm2 = (cdf2[..., 1:] + cdf2[..., :-1]) / 2
    prob1 = (cdfm2 * freq1).sum(-1)
    prob2 = (cdfm1 * freq2).sum(-1)
    return prob1, prob2


def cov_multinomial(probs):
    """covariance matrix of multinomial distribution

    This is vectorized with choices along last axis.

    cov = diag(probs) - outer(probs, probs)

    Parameters
    ----------
    probs : array_like
        Choice probabilities, choices in last axis.

    Returns
    -------
    ndarray
        Covariance matrices with two trailing axes of length k, where k is
        the number of choices; leading axes are broadcast from ``probs``.
    """
    # convert so that list input has ``shape`` and fancy indexing available
    probs = np.asarray(probs)
    k = probs.shape[-1]
    di = np.diag_indices(k, 2)
    # - outer(probs, probs), computed batched over leading axes
    cov = probs[..., None] * probs[..., None, :]
    cov *= - 1
    # add diag(probs)
    cov[..., di[0], di[1]] += probs
    return cov


def var_multinomial(probs):
    """variance of multinomial distribution

    var = probs * (1 - probs)

    Parameters
    ----------
    probs : array_like
        Choice probabilities.

    Returns
    -------
    ndarray
        Elementwise variances ``probs * (1 - probs)``.
    """
    probs = np.asarray(probs)
    var = probs * (1 - probs)
    return var