# -*- coding: utf-8 -*-
'''
Author: Vincent Arel-Bundock <varel@umich.edu>
Date: 2012-08-25

This example file implements 5 variations of the negative binomial regression
model for count data: NB-P, NB-1, NB-2, geometric and left-truncated.

The NBin class inherits from the GenericLikelihoodModel statsmodels class
which provides automatic numerical differentiation for the score and hessian.

NB-1, NB-2 and geometric are implemented as special cases of the NB-P model
described in Greene (2008) "Functional forms for the negative binomial model
for count data". Economics Letters, v99n3.

Tests are included to check how NB-1, NB-2 and geometric coefficient estimates
compare to equivalent models in R. Results usually agree up to the 4th digit.

The NB-P and left-truncated model results have not been compared to other
implementations. Note that NB-P appears to have been implemented only in the
LIMDEP software.
'''
from urllib.request import urlopen

import numpy as np
from numpy.testing import assert_almost_equal
from scipy.special import digamma
from scipy.stats import nbinom
import pandas
import patsy

from statsmodels.base.model import GenericLikelihoodModel
from statsmodels.base.model import GenericLikelihoodModelResults


#### Negative Binomial Log-likelihoods ####
def _ll_nbp(y, X, beta, alph, Q):
    r'''
    Negative Binomial Log-likelihood -- type P

    References:

    Greene, W. 2008. "Functional forms for the negative binomial model
    for count data". Economics Letters. Volume 99, Number 3, pp.585-590.
    Hilbe, J.M. 2011. "Negative binomial regression". Cambridge University Press.

    Following notation in Greene (2008), with negative binomial heterogeneity
    parameter :math:`\alpha`:

    .. math::

        \lambda_i = exp(X\beta)\\
        \theta = 1 / \alpha \\
        g_i = \theta \lambda_i^Q \\
        w_i = g_i/(g_i + \lambda_i) \\
        ln \mathcal{L}_i = ln \Gamma(y_i+g_i) - ln \Gamma(g_i) - ln \Gamma(1+y_i)
            + g_i ln(w_i) + y_i ln(1-w_i)
    '''
    mu = np.exp(np.dot(X, beta))
    size = 1 / alph * mu**Q
    prob = size / (size + mu)
    return nbinom.logpmf(y, size, prob)


def _ll_nb1(y, X, beta, alph):
    '''Negative Binomial regression (type 1 likelihood)'''
    return _ll_nbp(y, X, beta, alph, Q=1)


def _ll_nb2(y, X, beta, alph):
    '''Negative Binomial regression (type 2 likelihood)'''
    return _ll_nbp(y, X, beta, alph, Q=0)


def _ll_geom(y, X, beta):
    '''Geometric regression'''
    return _ll_nbp(y, X, beta, alph=1, Q=0)


def _ll_nbt(y, X, beta, alph, C=0):
    r'''
    Negative Binomial (left-truncated)

    Truncated densities for count models (Cameron & Trivedi, 2005, 680):

    .. math::

        f(y|\beta, y \geq C+1) = \frac{f(y|\beta)}{1-F(C|\beta)}
    '''
    Q = 0  # truncation is implemented for the NB-2 model
    mu = np.exp(np.dot(X, beta))
    size = 1 / alph * mu**Q
    prob = size / (size + mu)
    return nbinom.logpmf(y, size, prob) - np.log(1 - nbinom.cdf(C, size, prob))
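#### Sanity check for the truncated likelihood (illustrative sketch) ####
# The exponentiated `_ll_nbt` values should sum to ~1 over the truncated
# support y = C+1, C+2, ... The design matrix, coefficients and cut-point
# below are arbitrary illustrative values, not part of the original example.
def _check_nbt_normalization():
    beta, alph, C = np.array([1.0]), 0.7, 2      # illustrative parameter values
    y = np.arange(C + 1, 200)                    # truncated support (tail beyond 200 is negligible)
    X = np.ones((len(y), 1))                     # intercept-only design
    prob_mass = np.exp(_ll_nbt(y, X, beta, alph, C))
    print(prob_mass.sum())                       # should be close to 1.0
# _check_nbt_normalization()  # uncomment to run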
#### Model Classes ####
class NBin(GenericLikelihoodModel):
    '''
    Negative Binomial regression

    Parameters
    ----------
    endog : array_like
        1-d array of the response variable.
    exog : array_like
        `exog` is an n x p array where n is the number of observations and p
        is the number of regressors including the intercept if one is
        included in the data.
    ll_type : str
        log-likelihood type
        `nb2`: Negative Binomial type-2 (most common)
        `nb1`: Negative Binomial type-1
        `nbp`: Negative Binomial type-P (Greene, 2008)
        `nbt`: Left-truncated Negative Binomial (type-2)
        `geom`: Geometric regression model
    C : int
        Cut-point for the left-truncated (`nbt`) model.
    '''
    def __init__(self, endog, exog, ll_type='nb2', C=0, **kwds):
        self.exog = np.array(exog)
        self.endog = np.array(endog)
        self.C = C
        super().__init__(endog, exog, **kwds)
        # Check user input
        if ll_type not in ['nb2', 'nb1', 'nbp', 'nbt', 'geom']:
            raise ValueError('Valid ll_type values are: nb2, nb1, nbp, nbt, geom')
        self.ll_type = ll_type
        # Starting values (assumes the first column of exog is a constant)
        if ll_type == 'geom':
            self.start_params_default = np.zeros(self.exog.shape[1])
        elif ll_type == 'nbp':
            # Greene recommends starting NB-P at the NB-2 estimates (Q = 0)
            start_mod = NBin(endog, exog, 'nb2')
            start_res = start_mod.fit(disp=False)
            self.start_params_default = np.append(start_res.params, 0)
        else:
            self.start_params_default = np.append(np.zeros(self.exog.shape[1]), .5)
        self.start_params_default[0] = np.log(self.endog.mean())
        # Define the log-likelihood based on the ll_type argument
        if ll_type == 'nb1':
            self.ll_func = _ll_nb1
        elif ll_type == 'nb2':
            self.ll_func = _ll_nb2
        elif ll_type == 'geom':
            self.ll_func = _ll_geom
        elif ll_type == 'nbp':
            self.ll_func = _ll_nbp
        elif ll_type == 'nbt':
            self.ll_func = _ll_nbt

    def nloglikeobs(self, params):
        beta = params[:self.exog.shape[1]]
        if self.ll_type == 'geom':
            return -self.ll_func(self.endog, self.exog, beta)
        elif self.ll_type == 'nbt':
            return -self.ll_func(self.endog, self.exog, beta, params[-1], self.C)
        elif self.ll_type == 'nbp':
            # The parameter vector is ordered [beta, alpha, Q] so that the
            # NB-2 starting values [beta, alpha, 0] line up with Q = 0.
            return -self.ll_func(self.endog, self.exog, beta,
                                 params[-2], params[-1])
        else:
            return -self.ll_func(self.endog, self.exog, beta, params[-1])

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            start_params = self.start_params_default
        mlefit = super().fit(start_params=start_params,
                             maxiter=maxiter, maxfun=maxfun, **kwds)
        return CountResults(self, mlefit)


class CountResults(GenericLikelihoodModelResults):
    def __init__(self, model, mlefit):
        self.model = model
        self.__dict__.update(mlefit.__dict__)

    def summary(self, yname=None, xname=None, title=None, alpha=.05,
                yname_list=None):
        top_left = [('Dep. Variable:', None),
                    ('Model:', [self.model.__class__.__name__]),
                    ('Method:', ['MLE']),
                    ('Date:', None),
                    ('Time:', None),
                    ('Converged:', ["%s" % self.mle_retvals['converged']])]
        top_right = [('No. Observations:', None),
                     ('Log-Likelihood:', None),
                     ]
        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"
        # boilerplate
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        # top of table
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             yname=yname, xname=xname, title=title)
        # parameter table
        smry.add_table_params(self, yname=yname_list, xname=xname, alpha=alpha,
                              use_t=True)
        return smry
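#### Usage sketch ####
# A minimal illustration of the NBin API on simulated NB-2 data. The data
# generating process and parameter values below are illustrative assumptions,
# not part of the original example. For NB-2, size = 1/alpha and
# prob = size/(size + mu) = 1/(1 + alpha*mu).
def _demo_nb2():
    rng = np.random.default_rng(0)
    n = 2000
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    mu = np.exp(np.dot(X, [1.0, 0.5]))
    alph = 2.0
    y = rng.negative_binomial(1 / alph, 1 / (1 + alph * mu))
    res = NBin(y, X, 'nb2').fit(disp=False)
    print(res.params)  # should be close to [1.0, 0.5, 2.0]
# _demo_nb2()  # uncomment to run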
#### Score function for NB-P ####
def _score_nbp(y, X, beta, thet, Q):
    r'''
    Negative Binomial Score -- type P likelihood from Greene (2008)

    .. math::

        \lambda_i = exp(X\beta)\\
        g_i = \theta \lambda_i^Q \\
        w_i = g_i/(g_i + \lambda_i) \\
        A_i = g_i \left [ \Psi(y_i+g_i) - \Psi(g_i) + ln w_i \right ] \\
        B_i = g_i (1-w_i) - y_iw_i \\
        \partial ln \mathcal{L}_i / \partial
        \begin{pmatrix} \lambda_i \\ \theta \\ Q \end{pmatrix}=
        [A_i+B_i]
        \begin{pmatrix} Q/\lambda_i \\ 1/\theta \\ ln(\lambda_i) \end{pmatrix}
        -B_i
        \begin{pmatrix} 1/\lambda_i\\ 0 \\ 0 \end{pmatrix} \\
        \frac{\partial \lambda_i}{\partial \beta} = \lambda_i \mathbf{x}_i \\
        \frac{\partial \mathcal{L}_i}{\partial \beta} =
        \left (\frac{\partial\mathcal{L}_i}{\partial \lambda_i} \right )
        \frac{\partial \lambda_i}{\partial \beta}
    '''
    lamb = np.exp(np.dot(X, beta))
    g = thet * lamb**Q
    w = g / (g + lamb)
    # note: A includes the factor g, which the chain rule through
    # g = theta * lambda**Q requires; see the finite-difference check below
    A = g * (digamma(y + g) - digamma(g) + np.log(w))
    B = g * (1 - w) - y * w
    dl = (A + B) * Q / lamb - B / lamb
    dt = (A + B) / thet
    dq = (A + B) * np.log(lamb)
    db = X * (dl * lamb)[:, np.newaxis]
    return np.concatenate([db.sum(axis=0), [dt.sum(), dq.sum()]])
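#### Finite-difference check for the analytic NB-P score (illustrative sketch) ####
# Compares `_score_nbp` against a numerical gradient of the summed `_ll_nbp`
# log-likelihood. The simulated data and evaluation point are arbitrary
# illustrative assumptions. `_score_nbp` is parameterized in theta = 1/alpha,
# so the likelihood is evaluated at alph = 1/theta.
def _check_score_nbp():
    from scipy.optimize import approx_fprime
    rng = np.random.default_rng(1)
    n = 200
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    y = rng.poisson(np.exp(np.dot(X, [0.5, 0.2])))
    params = np.array([0.4, 0.1, 1.3, 0.6])   # [beta0, beta1, theta, Q]

    def ll_sum(p):
        return _ll_nbp(y, X, p[:2], 1 / p[2], p[3]).sum()

    numeric = approx_fprime(params, ll_sum, 1e-6)
    analytic = _score_nbp(y, X, params[:2], params[2], params[3])
    print(np.max(np.abs(numeric - analytic)))  # should be small
# _check_score_nbp()  # uncomment to run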
#### Tests ####
medpar = pandas.read_csv(urlopen('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/COUNT/medpar.csv'))
mdvis = pandas.read_csv(urlopen('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/COUNT/mdvis.csv'))

# NB-2
'''
# R v2.15.1
library(MASS)
library(COUNT)
data(medpar)
f <- los~factor(type)+hmo+white
mod <- glm.nb(f, medpar)
summary(mod)

Call:
glm.nb(formula = f, data = medpar, init.theta = 2.243376203,
    link = log)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.4671  -0.9090  -0.2693   0.4320   3.8668

Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept)    2.31028    0.06745  34.253  < 2e-16 ***
factor(type)2  0.22125    0.05046   4.385 1.16e-05 ***
factor(type)3  0.70616    0.07600   9.292  < 2e-16 ***
hmo           -0.06796    0.05321  -1.277    0.202
white         -0.12907    0.06836  -1.888    0.059 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(2.2434) family taken to be 1)

    Null deviance: 1691.1  on 1494  degrees of freedom
Residual deviance: 1568.1  on 1490  degrees of freedom
AIC: 9607

Number of Fisher Scoring iterations: 1

              Theta:  2.2434
          Std. Err.:  0.0997

 2 x log-likelihood:  -9594.9530
'''

def test_nb2():
    y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    y = np.array(y)[:, 0]
    nb2 = NBin(y, X, 'nb2').fit(maxiter=10000, maxfun=5000)
    assert_almost_equal(nb2.params,
                        [2.31027893349935, 0.221248978197356, 0.706158824346228,
                         -0.067955221930748, -0.129065442248951, 0.4457567],
                        decimal=2)

# NB-1
'''
# R v2.15.1
# COUNT v1.2.3
library(COUNT)
data(medpar)
f <- los~factor(type)+hmo+white
ml.nb1(f, medpar)

                 Estimate         SE          Z         LCL         UCL
(Intercept)    2.34918407 0.06023641 38.9994023  2.23112070  2.46724744
factor(type)2  0.16175471 0.04585569  3.5274735  0.07187757  0.25163186
factor(type)3  0.41879257 0.06553258  6.3906006  0.29034871  0.54723643
hmo           -0.04533566 0.05004714 -0.9058592 -0.14342805  0.05275673
white         -0.12951295 0.06071130 -2.1332593 -0.24850710 -0.01051880
alpha          4.57898241 0.22015968 20.7984603  4.14746943  5.01049539
'''

#def test_nb1():
    #y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    #y = np.array(y)[:, 0]
    ## TODO: Test fails with some of the other optimization methods
    #nb1 = NBin(y, X, 'nb1').fit(method='ncg', maxiter=10000, maxfun=5000)
    #assert_almost_equal(nb1.params,
                        #[2.34918407014186, 0.161754714412848, 0.418792569970658,
                         #-0.0453356614650342, -0.129512952033423, 4.57898241219275],
                        #decimal=2)

# NB-Geometric
'''
# R v2.15.1
# MASS v7.3-20
library(MASS)
data(medpar)
f <- los~factor(type)+hmo+white
mod <- glm(f, family=negative.binomial(1), data=medpar)
summary(mod)

Call:
glm(formula = f, family = negative.binomial(1), data = medpar)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-1.7942  -0.6545  -0.1896   0.3044   2.6844

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)    2.30849    0.07071  32.649  < 2e-16 ***
factor(type)2  0.22121    0.05283   4.187 2.99e-05 ***
factor(type)3  0.70599    0.08092   8.724  < 2e-16 ***
hmo           -0.06779    0.05521  -1.228   0.2197
white         -0.12709    0.07169  -1.773   0.0765 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1) family taken to be 0.5409721)

    Null deviance: 872.29  on 1494  degrees of freedom
Residual deviance: 811.95  on 1490  degrees of freedom
AIC: 9927.3

Number of Fisher Scoring iterations: 5
'''

#def test_geom():
    #y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    #y = np.array(y)[:, 0]
    ## TODO: remove alph from geom params
    #geom = NBin(y, X, 'geom').fit(maxiter=10000, maxfun=5000)
    #assert_almost_equal(geom.params,
                        #[2.3084850946241, 0.221206159108742, 0.705986369841159,
                         #-0.0677871843613577, -0.127088772164963],
                        #decimal=4)

test_nb2()
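#### Left-truncated model: simulation sketch ####
# The left-truncated results have not been compared to other implementations
# (see module docstring), so as a rough check one can simulate NB-2 data,
# truncate it at C = 0, and verify that the `nbt` model roughly recovers the
# data-generating parameters. All values below are illustrative assumptions.
def _demo_nbt():
    rng = np.random.default_rng(2)
    n = 5000
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    mu = np.exp(np.dot(X, [0.5, 0.3]))
    alph = 1.5
    y = rng.negative_binomial(1 / alph, 1 / (1 + alph * mu))
    keep = y > 0                        # left-truncation at C = 0
    res = NBin(y[keep], X[keep], 'nbt', C=0).fit(disp=False)
    print(res.params)                   # should be close to [0.5, 0.3, 1.5]
# _demo_nbt()  # uncomment to run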