# -*- coding: utf-8 -*-
'''
Author: Vincent Arel-Bundock <varel@umich.edu>
Date: 2012-08-25

This example file implements 5 variations of the negative binomial regression
model for count data: NB-P, NB-1, NB-2, geometric and left-truncated.

The NBin class inherits from the GenericLikelihoodModel statsmodels class,
which provides automatic numerical differentiation for the score and hessian.

NB-1, NB-2 and geometric are implemented as special cases of the NB-P model
described in Greene (2008) "Functional forms for the negative binomial model
for count data", Economics Letters, v99n3.

Tests are included to check how NB-1, NB-2 and geometric coefficient estimates
compare to equivalent models in R. Results usually agree up to the 4th digit.

The NB-P and left-truncated model results have not been compared to other
implementations. Note that NB-P appears to have been implemented only in the
LIMDEP software.
'''
from urllib.request import urlopen

import numpy as np
from numpy.testing import assert_almost_equal
from scipy.special import digamma
from scipy.stats import nbinom
import pandas
import patsy

from statsmodels.base.model import GenericLikelihoodModel
from statsmodels.base.model import GenericLikelihoodModelResults


#### Negative Binomial Log-likelihoods ####
def _ll_nbp(y, X, beta, alph, Q):
    r'''
    Negative Binomial Log-likelihood -- type P

    References:

    Greene, W. 2008. "Functional forms for the negative binomial model
        for count data". Economics Letters. Volume 99, Number 3, pp. 585-590.
    Hilbe, J.M. 2011. "Negative binomial regression". Cambridge University Press.

    Following the notation in Greene (2008), with negative binomial
    heterogeneity parameter :math:`\alpha`:

    .. math::

        \lambda_i = exp(X\beta)\\
        \theta = 1 / \alpha \\
        g_i = \theta \lambda_i^Q \\
        w_i = g_i/(g_i + \lambda_i) \\
        r_i = \theta / (\theta+\lambda_i) \\
        ln \mathcal{L}_i = ln \Gamma(y_i+g_i) - ln \Gamma(1+y_i) - ln \Gamma(g_i) + g_i ln (w_i) + y_i ln(1-w_i)
    '''
    mu = np.exp(np.dot(X, beta))
    size = 1/alph*mu**Q
    prob = size/(size+mu)
    ll = nbinom.logpmf(y, size, prob)
    return ll
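

# The parameterization above implies E[y|x] = lambda_i and, for the NB-P
# family, Var[y|x] = lambda_i + alpha * lambda_i**(2 - Q), so Q = 1 yields the
# NB-1 variance and Q = 0 the NB-2 variance. The helper below is a minimal
# sanity-check sketch of that claim; its name and the default parameter values
# are illustrative and not part of the original example.
def _check_nbp_moments(lamb=3.0, alph=0.5, Q=0.5):
    size = 1 / alph * lamb**Q
    prob = size / (size + lamb)
    mean, var = nbinom.stats(size, prob, moments='mv')
    assert_almost_equal(mean, lamb)
    assert_almost_equal(var, lamb + alph * lamb**(2 - Q))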


def _ll_nb1(y, X, beta, alph):
    '''Negative Binomial regression (type 1 likelihood)'''
    ll = _ll_nbp(y, X, beta, alph, Q=1)
    return ll


def _ll_nb2(y, X, beta, alph):
    '''Negative Binomial regression (type 2 likelihood)'''
    ll = _ll_nbp(y, X, beta, alph, Q=0)
    return ll


def _ll_geom(y, X, beta):
    '''Geometric regression'''
    ll = _ll_nbp(y, X, beta, alph=1, Q=0)
    return ll


def _ll_nbt(y, X, beta, alph, C=0):
    r'''
    Negative Binomial (truncated)

    Truncated densities for count models (Cameron & Trivedi, 2005, p. 680):

    .. math::

        f(y|\beta, y \geq C+1) = \frac{f(y|\beta)}{1-F(C|\beta)}
    '''
    Q = 0
    mu = np.exp(np.dot(X, beta))
    size = 1/alph*mu**Q
    prob = size/(size+mu)
    ll = nbinom.logpmf(y, size, prob) - np.log(1 - nbinom.cdf(C, size, prob))
    return ll
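

# Sanity-check sketch for the truncated likelihood above: the truncated pmf
# exp(_ll_nbt(...)) should sum to one over y >= C + 1. The helper name, the
# single-column design matrix and the parameter values are illustrative
# assumptions, not part of the original example.
def _check_nbt_normalization(C=0):
    X = np.ones((1, 1))
    beta = np.array([1.0])
    alph = 0.5
    support = np.arange(C + 1, 1000)
    Xs = np.repeat(X, len(support), axis=0)
    probs = np.exp(_ll_nbt(support, Xs, beta, alph, C=C))
    assert_almost_equal(probs.sum(), 1.0, decimal=6)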


#### Model Classes ####
class NBin(GenericLikelihoodModel):
    '''
    Negative Binomial regression

    Parameters
    ----------
    endog : array_like
        1-d array of the response variable.
    exog : array_like
        `exog` is an n x p array where n is the number of observations and p
        is the number of regressors including the intercept if one is
        included in the data.
    ll_type : str
        log-likelihood type
        `nb2`: Negative Binomial type-2 (most common)
        `nb1`: Negative Binomial type-1
        `nbp`: Negative Binomial type-P (Greene, 2008)
        `nbt`: Left-truncated Negative Binomial (type-2)
        `geom`: Geometric regression model
    C : int
        Cut-point for the `nbt` model
    '''
    def __init__(self, endog, exog, ll_type='nb2', C=0, **kwds):
        self.exog = np.array(exog)
        self.endog = np.array(endog)
        self.C = C
        super(NBin, self).__init__(endog, exog, **kwds)
        # Check user input
        if ll_type not in ['nb2', 'nb1', 'nbp', 'nbt', 'geom']:
            raise ValueError('Valid ll_type values are: nb2, nb1, nbp, nbt, geom')
        self.ll_type = ll_type
        # Starting values (assumes first column of exog is constant)
        if ll_type == 'geom':
            self.start_params_default = np.zeros(self.exog.shape[1])
        elif ll_type == 'nbp':
            # Greene recommends starting NB-P at the NB-2 estimates.  The
            # NB-P parameter vector is ordered (beta, Q, alpha), so insert
            # Q = 0 (the NB-2 special case) before the NB-2 alpha estimate.
            start_mod = NBin(endog, exog, 'nb2')
            start_res = start_mod.fit(disp=False)
            self.start_params_default = np.insert(start_res.params, -1, 0)
        else:
            self.start_params_default = np.append(np.zeros(self.exog.shape[1]), .5)
        self.start_params_default[0] = np.log(self.endog.mean())
        # Define loglik based on ll_type argument
        if ll_type == 'nb1':
            self.ll_func = _ll_nb1
        elif ll_type == 'nb2':
            self.ll_func = _ll_nb2
        elif ll_type == 'geom':
            self.ll_func = _ll_geom
        elif ll_type == 'nbp':
            self.ll_func = _ll_nbp
        elif ll_type == 'nbt':
            self.ll_func = _ll_nbt

    def nloglikeobs(self, params):
        alph = params[-1]
        beta = params[:self.exog.shape[1]]
        if self.ll_type == 'geom':
            return -self.ll_func(self.endog, self.exog, beta)
        elif self.ll_type == 'nbt':
            return -self.ll_func(self.endog, self.exog, beta, alph, self.C)
        elif self.ll_type == 'nbp':
            Q = params[-2]
            return -self.ll_func(self.endog, self.exog, beta, alph, Q)
        else:
            return -self.ll_func(self.endog, self.exog, beta, alph)

    def fit(self, start_params=None, maxiter=10000, maxfun=5000, **kwds):
        if start_params is None:
            countfit = super(NBin, self).fit(start_params=self.start_params_default,
                                             maxiter=maxiter, maxfun=maxfun, **kwds)
        else:
            countfit = super(NBin, self).fit(start_params=start_params,
                                             maxiter=maxiter, maxfun=maxfun, **kwds)
        countfit = CountResults(self, countfit)
        return countfit


class CountResults(GenericLikelihoodModelResults):
    def __init__(self, model, mlefit):
        self.model = model
        self.__dict__.update(mlefit.__dict__)

    def summary(self, yname=None, xname=None, title=None, alpha=.05,
                yname_list=None):
        top_left = [('Dep. Variable:', None),
                    ('Model:', [self.model.__class__.__name__]),
                    ('Method:', ['MLE']),
                    ('Date:', None),
                    ('Time:', None),
                    ('Converged:', ["%s" % self.mle_retvals['converged']])]
        top_right = [('No. Observations:', None),
                     ('Log-Likelihood:', None),
                     ]
        if title is None:
            title = self.model.__class__.__name__ + ' ' + "Regression Results"
        # boilerplate
        from statsmodels.iolib.summary import Summary
        smry = Summary()
        # for the top of the table
        smry.add_table_2cols(self, gleft=top_left, gright=top_right,
                             yname=yname, xname=xname, title=title)
        # for the parameters, etc.
        smry.add_table_params(self, yname=yname_list, xname=xname, alpha=alpha,
                              use_t=True)
        return smry
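

# The NB-P and left-truncated variants are not checked against R output below,
# so here is a small usage sketch on simulated NB-2 data. The helper name, the
# data-generating process and the sample size are illustrative assumptions,
# not part of the original example, and the function is not called on import.
def _example_nbp_nbt():
    rng = np.random.RandomState(42)
    n = 1000
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    mu = np.exp(1.0 + 0.5 * X[:, 1])
    theta = 2.0  # NB-2 heterogeneity: alpha = 1 / theta = 0.5
    y = nbinom.rvs(theta, theta / (theta + mu), random_state=rng)
    # NB-P: parameter vector is (beta, Q, alpha); starting values come from an
    # internal NB-2 fit, so this runs two optimizations.
    nbp_res = NBin(y, X, 'nbp').fit(maxiter=10000, maxfun=5000)
    print('NB-P estimates (beta, Q, alpha):', nbp_res.params)
    # Left-truncated NB-2 at C = 0: fit on the strictly positive counts only.
    pos = y > 0
    nbt_res = NBin(y[pos], X[pos], 'nbt', C=0).fit(maxiter=10000, maxfun=5000)
    print('NB-t estimates (beta, alpha):', nbt_res.params)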


#### Score function for NB-P ####


def _score_nbp(y, X, beta, thet, Q):
    r'''
    Negative Binomial Score -- type P likelihood from Greene (2008)

    .. math::

        \lambda_i = exp(X\beta)\\
        g_i = \theta \lambda_i^Q \\
        w_i = g_i/(g_i + \lambda_i) \\
        r_i = \theta / (\theta+\lambda_i) \\
        A_i = \left [ \Psi(y_i+g_i) - \Psi(g_i) + ln w_i \right ] \\
        B_i = \left [ g_i (1-w_i) - y_iw_i \right ] \\
        \partial ln \mathcal{L}_i / \partial
            \begin{pmatrix} \lambda_i \\ \theta \\ Q \end{pmatrix}=
            [A_i+B_i]
            \begin{pmatrix} Q/\lambda_i \\ 1/\theta \\ ln(\lambda_i) \end{pmatrix}
            -B_i
            \begin{pmatrix} 1/\lambda_i\\ 0 \\ 0 \end{pmatrix} \\
        \frac{\partial \lambda}{\partial \beta} = \lambda_i \mathbf{x}_i \\
        \frac{\partial \mathcal{L}_i}{\partial \beta} =
            \left (\frac{\partial\mathcal{L}_i}{\partial \lambda_i} \right )
            \frac{\partial \lambda_i}{\partial \beta}
    '''
    lamb = np.exp(np.dot(X, beta))
    g = thet * lamb**Q
    w = g / (g + lamb)
    r = thet / (thet+lamb)
    A = digamma(y+g) - digamma(g) + np.log(w)
    B = g*(1-w) - y*w
    dl = (A+B) * Q/lamb - B * 1/lamb
    dt = (A+B) * 1/thet
    dq = (A+B) * np.log(lamb)
    db = X * (dl * lamb)[:, np.newaxis]
    sc = np.array([dt.sum(), dq.sum()])
    sc = np.concatenate([db.sum(axis=0), sc])
    return sc
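

# _score_nbp is not wired into NBin above (GenericLikelihoodModel
# differentiates the log-likelihood numerically), but it can be compared
# against a finite-difference gradient of _ll_nbp. This is only a diagnostic
# sketch: the data, parameter values and step size are illustrative, and note
# that _score_nbp is parameterized in theta = 1 / alpha with the gradient
# ordered as (beta, theta, Q).
def _compare_score_nbp_to_finite_differences():
    rng = np.random.RandomState(1)
    n = 200
    X = np.column_stack([np.ones(n), rng.normal(size=n)])
    y = rng.poisson(np.exp(0.5 + 0.2 * X[:, 1]))
    beta, thet, Q = np.array([0.5, 0.2]), 2.0, 0.5

    def loglike(params):
        return _ll_nbp(y, X, params[:2], 1 / params[2], params[3]).sum()

    params = np.concatenate([beta, [thet, Q]])
    eps = 1e-6
    numgrad = np.array([(loglike(params + eps * e) - loglike(params - eps * e)) / (2 * eps)
                        for e in np.eye(len(params))])
    print('analytic score: ', _score_nbp(y, X, beta, thet, Q))
    print('numerical score:', numgrad)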


#### Tests ####
medpar = pandas.read_csv(urlopen('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/csv/COUNT/medpar.csv'))
mdvis = pandas.read_csv(urlopen('https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/csv/COUNT/mdvis.csv'))

# NB-2
'''
# R v2.15.1
library(MASS)
library(COUNT)
data(medpar)
f <- los~factor(type)+hmo+white
mod <- glm.nb(f, medpar)
summary(mod)
Call:
glm.nb(formula = f, data = medpar, init.theta = 2.243376203,
    link = log)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-2.4671  -0.9090  -0.2693   0.4320   3.8668

Coefficients:
              Estimate Std. Error z value Pr(>|z|)
(Intercept)    2.31028    0.06745  34.253  < 2e-16 ***
factor(type)2  0.22125    0.05046   4.385 1.16e-05 ***
factor(type)3  0.70616    0.07600   9.292  < 2e-16 ***
hmo           -0.06796    0.05321  -1.277    0.202
white         -0.12907    0.06836  -1.888    0.059 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(2.2434) family taken to be 1)

    Null deviance: 1691.1  on 1494  degrees of freedom
Residual deviance: 1568.1  on 1490  degrees of freedom
AIC: 9607

Number of Fisher Scoring iterations: 1


              Theta:  2.2434
          Std. Err.:  0.0997

 2 x log-likelihood:  -9594.9530
'''

def test_nb2():
    y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    y = np.array(y)[:, 0]
    nb2 = NBin(y, X, 'nb2').fit(maxiter=10000, maxfun=5000)
    assert_almost_equal(nb2.params,
                        [2.31027893349935, 0.221248978197356, 0.706158824346228,
                         -0.067955221930748, -0.129065442248951, 0.4457567],
                        decimal=2)

# NB-1
'''
# R v2.15.1
# COUNT v1.2.3
library(COUNT)
data(medpar)
f <- los~factor(type)+hmo+white
ml.nb1(f, medpar)

                 Estimate         SE          Z         LCL         UCL
(Intercept)    2.34918407 0.06023641 38.9994023  2.23112070  2.46724744
factor(type)2  0.16175471 0.04585569  3.5274735  0.07187757  0.25163186
factor(type)3  0.41879257 0.06553258  6.3906006  0.29034871  0.54723643
hmo           -0.04533566 0.05004714 -0.9058592 -0.14342805  0.05275673
white         -0.12951295 0.06071130 -2.1332593 -0.24850710 -0.01051880
alpha          4.57898241 0.22015968 20.7984603  4.14746943  5.01049539
'''

#def test_nb1():
    #y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    #y = np.array(y)[:,0]
    ## TODO: Test fails with some of the other optimization methods
    #nb1 = NBin(y,X,'nb1').fit(method='ncg', maxiter=10000, maxfun=5000)
    #assert_almost_equal(nb1.params,
                        #[2.34918407014186, 0.161754714412848, 0.418792569970658,
                        # -0.0453356614650342, -0.129512952033423, 4.57898241219275],
                        #decimal=2)

# NB-Geometric
'''
MASS v7.3-20
R v2.15.1
library(MASS)
data(medpar)
f <- los~factor(type)+hmo+white
mod <- glm(f, family=negative.binomial(1), data=medpar)
summary(mod)
Call:
glm(formula = f, family = negative.binomial(1), data = medpar)

Deviance Residuals:
    Min       1Q   Median       3Q      Max
-1.7942  -0.6545  -0.1896   0.3044   2.6844

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)    2.30849    0.07071  32.649  < 2e-16 ***
factor(type)2  0.22121    0.05283   4.187 2.99e-05 ***
factor(type)3  0.70599    0.08092   8.724  < 2e-16 ***
hmo           -0.06779    0.05521  -1.228   0.2197
white         -0.12709    0.07169  -1.773   0.0765 .
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for Negative Binomial(1) family taken to be 0.5409721)

    Null deviance: 872.29  on 1494  degrees of freedom
Residual deviance: 811.95  on 1490  degrees of freedom
AIC: 9927.3

Number of Fisher Scoring iterations: 5
'''

#def test_geom():
    #y, X = patsy.dmatrices('los ~ C(type) + hmo + white', medpar)
    #y = np.array(y)[:,0]
    ## TODO: remove alph from geom params
    #geom = NBin(y,X,'geom').fit(maxiter=10000, maxfun=5000)
    #assert_almost_equal(geom.params,
                        #[2.3084850946241, 0.221206159108742, 0.705986369841159,
                        # -0.0677871843613577, -0.127088772164963],
                        #decimal=4)

test_nb2()