1# -*- coding: utf-8 -*- 2""" 3Created on Fri Sep 15 12:53:45 2017 4 5Author: Josef Perktold 6""" 7 8import numpy as np 9from scipy import stats 10 11from statsmodels.discrete.discrete_model import Poisson 12from statsmodels.regression.linear_model import OLS 13 14 15def _combine_bins(edge_index, x): 16 """group columns into bins using sum 17 18 This is mainly a helper function for combining probabilities into cells. 19 It similar to `np.add.reduceat(x, edge_index, axis=-1)` except for the 20 treatment of the last index and last cell. 21 22 Parameters 23 ---------- 24 edge_index : array_like 25 This defines the (zero-based) indices for the columns that are be 26 combined. Each index in `edge_index` except the last is the starting 27 index for a bin. The largest index in a bin is the next edge_index-1. 28 x : 1d or 2d array 29 array for which columns are combined. If x is 1-dimensional that it 30 will be treated as a 2-d row vector. 31 32 Returns 33 ------- 34 x_new : ndarray 35 36 37 Examples 38 -------- 39 >>> dia.combine_bins([0,1,5], np.arange(4)) 40 (array([0, 6]), array([1, 4])) 41 42 this aggregates to two bins with the sum of 1 and 4 elements 43 >>> np.arange(4)[0].sum() 44 0 45 >>> np.arange(4)[1:5].sum() 46 6 47 48 If the rightmost index is smaller than len(x)+1, then the remaining 49 columns will not be included. 50 51 >>> dia.combine_bins([0,1,3], np.arange(4)) 52 (array([0, 3]), array([1, 2])) 53 """ 54 x = np.asarray(x) 55 if x.ndim == 1: 56 is_1d = True 57 x = x[None, :] 58 else: 59 is_1d = False 60 xli = [] 61 kli = [] 62 for bin_idx in range(len(edge_index) - 1): 63 i, j = edge_index[bin_idx : bin_idx + 2] 64 xli.append(x[:, i:j].sum(1)) 65 kli.append(j - i) 66 67 x_new = np.column_stack(xli) 68 if is_1d: 69 x_new = x_new.squeeze() 70 return x_new, np.asarray(kli) 71 72 73def plot_probs(freq, probs_predicted, label='predicted', upp_xlim=None, 74 fig=None): 75 """diagnostic plots for comparing two lists of discrete probabilities 76 77 Parameters 78 ---------- 79 freq, probs_predicted : nd_arrays 80 two arrays of probabilities, this can be any probabilities for 81 the same events, default is designed for comparing predicted 82 and observed probabilities 83 label : str or tuple 84 If string, then it will be used as the label for probs_predicted and 85 "freq" is used for the other probabilities. 86 If label is a tuple of strings, then the first is they are used as 87 label for both probabilities 88 89 upp_xlim : None or int 90 If it is not None, then the xlim of the first two plots are set to 91 (0, upp_xlim), otherwise the matplotlib default is used 92 fig : None or matplotlib figure instance 93 If fig is provided, then the axes will be added to it in a (3,1) 94 subplots, otherwise a matplotlib figure instance is created 95 96 Returns 97 ------- 98 Figure 99 The figure contains 3 subplot with probabilities, cumulative 100 probabilities and a PP-plot 101 """ 102 103 if isinstance(label, list): 104 label0, label1 = label 105 else: 106 label0, label1 = 'freq', label 107 108 if fig is None: 109 import matplotlib.pyplot as plt 110 fig = plt.figure(figsize=(8,12)) 111 ax1 = fig.add_subplot(311) 112 ax1.plot(freq, '-o', label=label0) 113 ax1.plot(probs_predicted, '-d', label=label1) 114 if upp_xlim is not None: 115 ax1.set_xlim(0, upp_xlim) 116 ax1.legend() 117 ax1.set_title('probabilities') 118 119 ax2 = fig.add_subplot(312) 120 ax2.plot(np.cumsum(freq), '-o', label=label0) 121 ax2.plot(np.cumsum(probs_predicted), '-d', label=label1) 122 if upp_xlim is not None: 123 ax2.set_xlim(0, upp_xlim) 124 ax2.legend() 125 ax2.set_title('cumulative probabilities') 126 127 ax3 = fig.add_subplot(313) 128 ax3.plot(np.cumsum(probs_predicted), np.cumsum(freq), 'o') 129 ax3.plot(np.arange(len(freq)) / len(freq), np.arange(len(freq)) / len(freq)) 130 ax3.set_title('PP-plot') 131 ax3.set_xlabel(label1) 132 ax3.set_ylabel(label0) 133 return fig 134 135 136def test_chisquare_prob(results, probs, bin_edges=None, method=None): 137 """ 138 chisquare test for predicted probabilities using cmt-opg 139 140 Parameters 141 ---------- 142 results : results instance 143 Instance of a count regression results 144 probs : ndarray 145 Array of predicted probabilities with observations 146 in rows and event counts in columns 147 bin_edges : None or array 148 intervals to combine several counts into cells 149 see combine_bins 150 151 Returns 152 ------- 153 (api not stable, replace by test-results class) 154 statistic : float 155 chisquare statistic for tes 156 p-value : float 157 p-value of test 158 df : int 159 degrees of freedom for chisquare distribution 160 extras : ??? 161 currently returns a tuple with some intermediate results 162 (diff, res_aux) 163 164 Notes 165 ----- 166 167 Status : experimental, no verified unit tests, needs to be generalized 168 currently only OPG version with auxiliary regression is implemented 169 170 171 Assumes counts are np.arange(probs.shape[1]), i.e. consecutive 172 integers starting at zero. 173 174 Auxiliary regression drops the last column of binned probs to avoid 175 that probabilities sum to 1. 176 """ 177 res = results 178 score_obs = results.model.score_obs(results.params) 179 d_ind = (res.model.endog[:, None] == np.arange(probs.shape[1])).astype(int) 180 if bin_edges is not None: 181 d_ind_bins, k_bins = _combine_bins(bin_edges, d_ind) 182 probs_bins, k_bins = _combine_bins(bin_edges, probs) 183 else: 184 d_ind_bins, k_bins = d_ind, d_ind.shape[1] 185 probs_bins = probs 186 diff1 = d_ind_bins - probs_bins 187 #diff2 = (1 - d_ind.sum(1)) - (1 - probs_bins.sum(1)) 188 x_aux = np.column_stack((score_obs, diff1[:, :-1])) #, diff2)) 189 nobs = x_aux.shape[0] 190 res_aux = OLS(np.ones(nobs), x_aux).fit() 191 192 chi2_stat = nobs * (1 - res_aux.ssr / res_aux.uncentered_tss) 193 df = res_aux.model.rank - score_obs.shape[1] 194 if df < k_bins - 1: 195 # not a problem in general, but it can be for OPG version 196 import warnings 197 warnings.warn('auxiliary model is rank deficient') 198 extras = (diff1, res_aux) 199 return chi2_stat, stats.chi2.sf(chi2_stat, df), df, extras 200 201 202def test_poisson_zeroinflation(results_poisson, exog_infl=None): 203 """score test for zero inflation or deflation in Poisson 204 205 This implements Jansakul and Hinde 2009 score test 206 for excess zeros against a zero modified Poisson 207 alternative. They use a linear link function for the 208 inflation model to allow for zero deflation. 209 210 Parameters 211 ---------- 212 results_poisson: results instance 213 The test is only valid if the results instance is a Poisson 214 model. 215 exog_infl : ndarray 216 Explanatory variables for the zero inflated or zero modified 217 alternative. I exog_infl is None, then the inflation 218 probability is assumed to be constant. 219 220 Returns 221 ------- 222 score test results based on chisquare distribution 223 224 Notes 225 ----- 226 This is a score test based on the null hypothesis that 227 the true model is Poisson. It will also reject for 228 other deviations from a Poisson model if those affect 229 the zero probabilities, e.g. in the direction of 230 excess dispersion as in the Negative Binomial 231 or Generalized Poisson model. 232 Therefore, rejection in this test does not imply that 233 zero-inflated Poisson is the appropriate model. 234 235 Status: experimental, no verified unit tests, 236 237 TODO: If the zero modification probability is assumed 238 to be constant under the alternative, then we only have 239 a scalar test score and we can use one-sided tests to 240 distinguish zero inflation and deflation from the 241 two-sided deviations. (The general one-sided case is 242 difficult.) 243 244 References 245 ---------- 246 Jansakul and Hinde 2009 247 """ 248 if not isinstance(results_poisson.model, Poisson): 249 # GLM Poisson would be also valid, not tried 250 import warnings 251 warnings.warn('Test is only valid if model is Poisson') 252 253 nobs = results_poisson.model.endog.shape[0] 254 255 if exog_infl is None: 256 exog_infl = np.ones((nobs, 1)) 257 258 259 endog = results_poisson.model.endog 260 exog = results_poisson.model.exog 261 262 mu = results_poisson.predict() 263 prob_zero = np.exp(-mu) 264 265 cov_poi = results_poisson.cov_params() 266 cross_derivative = (exog_infl.T * (-mu)).dot(exog).T 267 cov_infl = (exog_infl.T * ((1 - prob_zero) / prob_zero)).dot(exog_infl) 268 score_obs_infl = exog_infl * (((endog == 0) - prob_zero) / prob_zero)[:,None] 269 #score_obs_infl = exog_infl * ((endog == 0) * (1 - prob_zero) / prob_zero - (endog>0))[:,None] #same 270 score_infl = score_obs_infl.sum(0) 271 cov_score_infl = cov_infl - cross_derivative.T.dot(cov_poi).dot(cross_derivative) 272 cov_score_infl_inv = np.linalg.pinv(cov_score_infl) 273 statistic = score_infl.dot(cov_score_infl_inv).dot(score_infl) 274 df2 = np.linalg.matrix_rank(cov_score_infl) # more general, maybe not needed 275 df = exog_infl.shape[1] 276 pvalue = stats.chi2.sf(statistic, df) 277 return statistic, pvalue, df, df2 278 279 280def test_poisson_zeroinflation_brock(results_poisson): 281 """score test for zero modification in Poisson, special case 282 283 This assumes that the Poisson model has a constant and that 284 the zero modification probability is constant. 285 286 This is a special case of test_poisson_zeroinflation derived by 287 van den Brock 1995. 288 289 The test reports two sided and one sided alternatives based on 290 the normal distribution of the test statistic. 291 """ 292 293 mu = results_poisson.predict() 294 prob_zero = np.exp(-mu) 295 endog = results_poisson.model.endog 296 nobs = len(endog) 297 298 score = ((endog == 0) / prob_zero).sum() - nobs 299 var_score = (1 / prob_zero).sum() - nobs - endog.sum() 300 statistic = score / np.sqrt(var_score) 301 pvalue_two = 2 * stats.norm.sf(np.abs(statistic)) 302 pvalue_upp = stats.norm.sf(statistic) 303 pvalue_low = stats.norm.cdf(statistic) 304 return statistic, pvalue_two, pvalue_upp, pvalue_low, stats.chi2.sf(statistic**2, 1) 305