1"""
2Test functions for models.GLM
3"""
4import os
5import warnings
6
7import numpy as np
8from numpy.testing import (
9    assert_,
10    assert_allclose,
11    assert_almost_equal,
12    assert_array_less,
13    assert_equal,
14    assert_raises,
15)
16import pandas as pd
17from pandas.testing import assert_series_equal
18import pytest
19from scipy import stats
20
21import statsmodels.api as sm
22from statsmodels.datasets import cpunish, longley
23from statsmodels.discrete import discrete_model as discrete
24from statsmodels.genmod.generalized_linear_model import GLM, SET_USE_BIC_LLF
25from statsmodels.tools.numdiff import (
26    approx_fprime,
27    approx_fprime_cs,
28    approx_hess,
29    approx_hess_cs,
30)
31from statsmodels.tools.sm_exceptions import (
32    DomainWarning,
33    PerfectSeparationError,
34    ValueWarning,
35)
36from statsmodels.tools.tools import add_constant
37
38# Test Precisions
39DECIMAL_4 = 4
40DECIMAL_3 = 3
41DECIMAL_2 = 2
42DECIMAL_1 = 1
43DECIMAL_0 = 0
44
45pdf_output = False
46
47if pdf_output:
48    from matplotlib.backends.backend_pdf import PdfPages
49    pdf = PdfPages("test_glm.pdf")
50else:
51    pdf = None
52
53
54def close_or_save(pdf, fig):
55    if pdf_output:
56        pdf.savefig(fig)
57
58
59def teardown_module():
60    if pdf_output:
61        pdf.close()
62
63
64@pytest.fixture(scope="module")
65def iris():
66    cur_dir = os.path.dirname(os.path.abspath(__file__))
67    return np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'),
68                         delimiter=",", skip_header=1)
69
70
71class CheckModelResultsMixin(object):
72    '''
73    res2 should be either the results from RModelWrap
74    or the results as defined in model_results_data
75    '''
76
77    decimal_params = DECIMAL_4
78    def test_params(self):
79        assert_almost_equal(self.res1.params, self.res2.params,
80                self.decimal_params)
81
82    decimal_bse = DECIMAL_4
83    def test_standard_errors(self):
84        assert_allclose(self.res1.bse, self.res2.bse,
85                        atol=10**(-self.decimal_bse), rtol=1e-5)
86
87    decimal_resids = DECIMAL_4
88    def test_residuals(self):
89        # fix incorrect numbers in resid_working results
90        # residuals for Poisson are also tested in test_glm_weights.py
91        import copy
92
        # newer numpy arrays would have a copy method
        resid2 = copy.copy(self.res2.resids)
        resid2[:, 2] *= self.res1.family.link.deriv(self.res1.mu)**2

        atol = 10**(-self.decimal_resids)
        resid_a = self.res1.resid_anscombe_unscaled
        resids = np.column_stack((self.res1.resid_pearson,
                self.res1.resid_deviance, self.res1.resid_working,
                resid_a, self.res1.resid_response))
        assert_allclose(resids, resid2, rtol=1e-6, atol=atol)

    decimal_aic_R = DECIMAL_4

    def test_aic_R(self):
        # R counts the estimation of the scale as a lost degree of freedom,
        # but does not do so for Gamma
        if self.res1.scale != 1:
            dof = 2
        else:
            dof = 0
        if isinstance(self.res1.model.family, (sm.families.NegativeBinomial)):
            llf = self.res1.model.family.loglike(self.res1.model.endog,
                                                 self.res1.mu,
                                                 self.res1.model.var_weights,
                                                 self.res1.model.freq_weights,
                                                 scale=1)
            aic = (-2*llf+2*(self.res1.df_model+1))
        else:
            aic = self.res1.aic
        assert_almost_equal(aic+dof, self.res2.aic_R,
                self.decimal_aic_R)

    decimal_aic_Stata = DECIMAL_4
    def test_aic_Stata(self):
        # Stata uses the below llf for aic definition for these families
        if isinstance(self.res1.model.family, (sm.families.Gamma,
                                               sm.families.InverseGaussian,
                                               sm.families.NegativeBinomial)):
            llf = self.res1.model.family.loglike(self.res1.model.endog,
                                                 self.res1.mu,
                                                 self.res1.model.var_weights,
                                                 self.res1.model.freq_weights,
                                                 scale=1)
            aic = (-2*llf+2*(self.res1.df_model+1))/self.res1.nobs
        else:
            aic = self.res1.aic/self.res1.nobs
        assert_almost_equal(aic, self.res2.aic_Stata, self.decimal_aic_Stata)

    decimal_deviance = DECIMAL_4
    def test_deviance(self):
        assert_almost_equal(self.res1.deviance, self.res2.deviance,
                self.decimal_deviance)

    decimal_scale = DECIMAL_4
    def test_scale(self):
        assert_almost_equal(self.res1.scale, self.res2.scale,
                self.decimal_scale)

    decimal_loglike = DECIMAL_4
    def test_loglike(self):
        # Stata uses the below llf for these families
        # We differ with R for them
        if isinstance(self.res1.model.family, (sm.families.Gamma,
                                               sm.families.InverseGaussian,
                                               sm.families.NegativeBinomial)):
            llf = self.res1.model.family.loglike(self.res1.model.endog,
                                                 self.res1.mu,
                                                 self.res1.model.var_weights,
                                                 self.res1.model.freq_weights,
                                                 scale=1)
        else:
            llf = self.res1.llf
        assert_almost_equal(llf, self.res2.llf, self.decimal_loglike)

    decimal_null_deviance = DECIMAL_4
    def test_null_deviance(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DomainWarning)

            assert_almost_equal(self.res1.null_deviance,
                                self.res2.null_deviance,
                                self.decimal_null_deviance)

    decimal_bic = DECIMAL_4
    def test_bic(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            assert_almost_equal(self.res1.bic,
                                self.res2.bic_Stata,
                                self.decimal_bic)

    def test_degrees(self):
        assert_equal(self.res1.model.df_resid, self.res2.df_resid)

    decimal_fittedvalues = DECIMAL_4
    def test_fittedvalues(self):
        assert_almost_equal(self.res1.fittedvalues, self.res2.fittedvalues,
                self.decimal_fittedvalues)

    def test_tpvalues(self):
        # test comparing tvalues and pvalues with normal implementation
        # make sure they use normal distribution (inherited in results class)
        params = self.res1.params
        tvalues = params / self.res1.bse
        pvalues = stats.norm.sf(np.abs(tvalues)) * 2
        half_width = stats.norm.isf(0.025) * self.res1.bse
        conf_int = np.column_stack((params - half_width, params + half_width))
        if isinstance(tvalues, pd.Series):
            assert_series_equal(self.res1.tvalues, tvalues)
        else:
            assert_almost_equal(self.res1.tvalues, tvalues)
        assert_almost_equal(self.res1.pvalues, pvalues)
        assert_almost_equal(self.res1.conf_int(), conf_int)

    def test_pearson_chi2(self):
        if hasattr(self.res2, 'pearson_chi2'):
            assert_allclose(self.res1.pearson_chi2, self.res2.pearson_chi2,
                            atol=1e-6, rtol=1e-6)

    def test_prsquared(self):
        if hasattr(self.res2, 'prsquared'):
            assert_allclose(self.res1.pseudo_rsquared(kind="mcf"),
                            self.res2.prsquared, rtol=0.05)

        if hasattr(self.res2, 'prsquared_cox_snell'):
            assert_allclose(float(self.res1.pseudo_rsquared(kind="cs")),
                            self.res2.prsquared_cox_snell, rtol=0.05)

    @pytest.mark.smoke
    def test_summary(self):
        self.res1.summary()

    @pytest.mark.smoke
    def test_summary2(self):
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", DomainWarning)
            self.res1.summary2()

    def test_get_distribution(self):
        res1 = self.res1
        if not hasattr(res1.model.family, "get_distribution"):
            # only Tweedie lacks get_distribution
            pytest.skip("get_distribution not available")

        if isinstance(res1.model.family, sm.families.NegativeBinomial):
            res_scale = 1  # QMLE scale can differ from 1
        else:
            res_scale = res1.scale

        distr = res1.model.family.get_distribution(res1.fittedvalues,
                                                   res_scale)
        var_endog = res1.model.family.variance(res1.fittedvalues) * res_scale
        m, v = distr.stats()
        assert_allclose(res1.fittedvalues, m, rtol=1e-13)
        assert_allclose(var_endog, v, rtol=1e-13)
        # check model method
        distr2 = res1.model.get_distribution(res1.params, res_scale)
        for k in distr2.kwds:
            assert_allclose(distr.kwds[k], distr2.kwds[k], rtol=1e-13)

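
# Illustrative sketch, not part of the original suite (the leading
# underscore keeps pytest from collecting it): test_tpvalues above checks
# that GLM Wald inference is based on the normal distribution.  The same
# numbers can be rebuilt by hand from params and bse; the longley data is
# used here only as a convenient placeholder.
def _sketch_normal_inference():
    data = longley.load()
    exog = add_constant(np.asarray(data.exog), prepend=False)
    res = GLM(np.asarray(data.endog), exog,
              family=sm.families.Gaussian()).fit()
    tvalues = res.params / res.bse
    pvalues = stats.norm.sf(np.abs(tvalues)) * 2
    half_width = stats.norm.isf(0.025) * res.bse
    conf_int = np.column_stack((res.params - half_width,
                                res.params + half_width))
    assert_almost_equal(res.tvalues, tvalues)
    assert_almost_equal(res.pvalues, pvalues)
    assert_almost_equal(res.conf_int(), conf_int)
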

class CheckComparisonMixin(object):

    def test_compare_discrete(self):
        res1 = self.res1
        resd = self.resd

        assert_allclose(res1.llf, resd.llf, rtol=1e-10)
        score_obs1 = res1.model.score_obs(res1.params * 0.98)
        score_obsd = resd.model.score_obs(resd.params * 0.98)
        assert_allclose(score_obs1, score_obsd, rtol=1e-10)

        # score
        score1 = res1.model.score(res1.params * 0.98)
        assert_allclose(score1, score_obs1.sum(0), atol=1e-20)
        score0 = res1.model.score(res1.params)
        assert_allclose(score0, np.zeros(score_obs1.shape[1]), atol=5e-7)

        hessian1 = res1.model.hessian(res1.params * 0.98, observed=False)
        hessiand = resd.model.hessian(resd.params * 0.98)
        assert_allclose(hessian1, hessiand, rtol=1e-10)

        hessian1 = res1.model.hessian(res1.params * 0.98, observed=True)
        hessiand = resd.model.hessian(resd.params * 0.98)
        assert_allclose(hessian1, hessiand, rtol=1e-9)

    def test_score_test(self):
        res1 = self.res1
        # contrived example: at the full MLE the statistic should be zero;
        # the proper k_constraints here would be 0
        st, pv, df = res1.model.score_test(res1.params, k_constraints=1)
        assert_allclose(st, 0, atol=1e-20)
        assert_allclose(pv, 1, atol=1e-10)
        assert_equal(df, 1)

        st, pv, df = res1.model.score_test(res1.params, k_constraints=0)
        assert_allclose(st, 0, atol=1e-20)
        assert_(np.isnan(pv), msg=repr(pv))
        assert_equal(df, 0)

        # TODO: no verified numbers; largely a smoke test
        exog_extra = res1.model.exog[:, 1]**2
        st, pv, df = res1.model.score_test(res1.params, exog_extra=exog_extra)
        assert_array_less(0.1, st)
        assert_array_less(0.1, pv)
        assert_equal(df, 1)

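
# Illustrative sketch of the score_test API exercised above (underscored so
# pytest does not collect it; data and tolerances are assumptions, not
# verified reference values): at the unconstrained MLE the score statistic
# is numerically zero, and exog_extra tests the fitted model against an
# enlarged one.
def _sketch_score_test():
    np.random.seed(123)
    exog = add_constant(np.random.normal(size=(200, 2)), prepend=False)
    endog = np.random.poisson(np.exp(0.1 * exog[:, :2].sum(1)))
    res = GLM(endog, exog, family=sm.families.Poisson()).fit()
    st, pv, df = res.model.score_test(res.params, k_constraints=1)
    assert_allclose(st, 0, atol=1e-6)
    assert_allclose(pv, 1, atol=1e-6)
    # testing an added squared regressor against the fitted model
    st, pv, df = res.model.score_test(res.params,
                                      exog_extra=res.model.exog[:, 0]**2)
    assert_equal(df, 1)
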

class TestGlmGaussian(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_3

        from statsmodels.datasets.longley import load
        cls.data = load()
        cls.data.endog = np.asarray(cls.data.endog)
        cls.data.exog = np.asarray(cls.data.exog)
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        cls.res1 = GLM(cls.data.endog, cls.data.exog,
                       family=sm.families.Gaussian()).fit()
        from .results.results_glm import Longley
        cls.res2 = Longley()

    def test_compare_OLS(self):
        res1 = self.res1
        # OLS does not define score_obs
        from statsmodels.regression.linear_model import OLS
        resd = OLS(self.data.endog, self.data.exog).fit()
        self.resd = resd  # attach to access from the outside

        assert_allclose(res1.llf, resd.llf, rtol=1e-10)
        score_obs1 = res1.model.score_obs(res1.params, scale=None)
        score_obsd = resd.resid[:, None] / resd.scale * resd.model.exog
        # low precision because of badly scaled exog
        assert_allclose(score_obs1, score_obsd, rtol=1e-8)

        score_obs1 = res1.model.score_obs(res1.params, scale=1)
        score_obsd = resd.resid[:, None] * resd.model.exog
        assert_allclose(score_obs1, score_obsd, rtol=1e-8)

        hess_obs1 = res1.model.hessian(res1.params, scale=None)
        hess_obsd = -1. / resd.scale * resd.model.exog.T.dot(resd.model.exog)
        # low precision because of badly scaled exog
        assert_allclose(hess_obs1, hess_obsd, rtol=1e-8)

# FIXME: enable or delete
#    def setup(self):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        Gauss = r.gaussian
#        self.res2 = RModel(self.data.endog, self.data.exog, r.glm, family=Gauss)
#        self.res2.resids = np.array(self.res2.resid)[:,None]*np.ones((1,5))
#        self.res2.null_deviance = 185008826 # taken from R. Rpy bug?


class TestGlmGaussianGradient(TestGlmGaussian):
    @classmethod
    def setup_class(cls):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_2

        from statsmodels.datasets.longley import load
        cls.data = load()
        cls.data.endog = np.asarray(cls.data.endog)
        cls.data.exog = np.asarray(cls.data.exog)
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        cls.res1 = GLM(cls.data.endog, cls.data.exog,
                       family=sm.families.Gaussian()).fit(method='bfgs')
        from .results.results_glm import Longley
        cls.res2 = Longley()


class TestGaussianLog(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precision
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_aic_Stata = DECIMAL_2
        cls.decimal_loglike = DECIMAL_0
        cls.decimal_null_deviance = DECIMAL_1

        nobs = 100
        x = np.arange(nobs)
        np.random.seed(54321)
#        y = 1.0 - .02*x - .001*x**2 + 0.001 * np.random.randn(nobs)
        cls.X = np.c_[np.ones((nobs, 1)), x, x**2]
        cls.lny = np.exp(-(-1.0 + 0.02*x + 0.0001*x**2)) +\
                        0.001 * np.random.randn(nobs)

        GaussLog_Model = GLM(cls.lny, cls.X,
                             family=sm.families.Gaussian(sm.families.links.log()))
        cls.res1 = GaussLog_Model.fit()
        from .results.results_glm import GaussianLog
        cls.res2 = GaussianLog()

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed"
#        GaussLogLink = r.gaussian(link = "log")
#        GaussLog_Res_R = RModel(cls.lny, cls.X, r.glm, family=GaussLogLink)
#        cls.res2 = GaussLog_Res_R


class TestGaussianInverse(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precisions
        cls.decimal_bic = DECIMAL_1
        cls.decimal_aic_R = DECIMAL_1
        cls.decimal_aic_Stata = DECIMAL_3
        cls.decimal_loglike = DECIMAL_1
        cls.decimal_resids = DECIMAL_3

        nobs = 100
        x = np.arange(nobs)
        np.random.seed(54321)
        y = 1.0 + 2.0 * x + x**2 + 0.1 * np.random.randn(nobs)
        cls.X = np.c_[np.ones((nobs, 1)), x, x**2]
        cls.y_inv = (1. + .02*x + .001*x**2)**-1 + .001 * np.random.randn(nobs)
        InverseLink_Model = GLM(cls.y_inv, cls.X,
                family=sm.families.Gaussian(sm.families.links.inverse_power()))
        InverseLink_Res = InverseLink_Model.fit()
        cls.res1 = InverseLink_Res
        from .results.results_glm import GaussianInverse
        cls.res2 = GaussianInverse()

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        InverseLink = r.gaussian(link = "inverse")
#        InverseLink_Res_R = RModel(cls.y_inv, cls.X, r.glm, family=InverseLink)
#        cls.res2 = InverseLink_Res_R


class TestGlmBinomial(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Test Binomial family with canonical logit link using star98 dataset.
        '''
        cls.decimal_resids = DECIMAL_1
        cls.decimal_bic = DECIMAL_2

        from statsmodels.datasets.star98 import load

        from .results.results_glm import Star98
        data = load()
        data.endog = np.asarray(data.endog)
        data.exog = np.asarray(data.exog)
        data.exog = add_constant(data.exog, prepend=False)
        cls.res1 = GLM(data.endog, data.exog,
                       family=sm.families.Binomial()).fit()
        # NOTE: if you want to replicate with RModel
        # res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm,
        #        family=r.binomial, weights=trials)

        cls.res2 = Star98()

    def test_endog_dtype(self):
        from statsmodels.datasets.star98 import load
        data = load()
        data.exog = add_constant(data.exog, prepend=False)
        endog = data.endog.astype(int)
        res2 = GLM(endog, data.exog, family=sm.families.Binomial()).fit()
        assert_allclose(res2.params, self.res1.params)
        endog = data.endog.astype(np.double)
        res3 = GLM(endog, data.exog, family=sm.families.Binomial()).fit()
        assert_allclose(res3.params, self.res1.params)

    def test_invalid_endog(self, reset_randomstate):
        # GH2733 inspired check
        endog = np.random.randint(0, 100, size=(1000, 3))
        exog = np.random.standard_normal((1000, 2))
        with pytest.raises(ValueError, match='endog has more than 2 columns'):
            GLM(endog, exog, family=sm.families.Binomial())

    def test_invalid_endog_formula(self, reset_randomstate):
        # GH2733
        n = 200
        exog = np.random.normal(size=(n, 2))
        endog = np.random.randint(0, 3, size=n).astype(str)
        # formula interface
        data = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]})
        with pytest.raises(ValueError, match='array with multiple columns'):
            sm.GLM.from_formula("y ~ x1 + x2", data,
                                family=sm.families.Binomial())

    def test_get_distribution_binom_count(self):
        # test for binomial counts with n_trials > 1
        res1 = self.res1
        res_scale = 1  # QMLE scale can differ from 1

        mu_prob = res1.fittedvalues
        n = res1.model.n_trials
        distr = res1.model.family.get_distribution(mu_prob, res_scale,
                                                   n_trials=n)
        var_endog = res1.model.family.variance(mu_prob) * res_scale
        m, v = distr.stats()
        assert_allclose(mu_prob * n, m, rtol=1e-13)
        assert_allclose(var_endog * n, v, rtol=1e-13)

        # check model method
        distr2 = res1.model.get_distribution(res1.params, res_scale,
                                             n_trials=n)
        for k in distr2.kwds:
            assert_allclose(distr.kwds[k], distr2.kwds[k], rtol=1e-13)

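
# Illustrative sketch (underscored, not collected by pytest): as in the
# star98 data used above, the Binomial family also accepts a two-column
# endog of (success, failure) counts; fitted values are then the success
# probabilities.  The counts below are synthetic placeholders.
def _sketch_binomial_counts():
    np.random.seed(321)
    n_trials = 10
    exog = add_constant(np.random.normal(size=(100, 2)), prepend=False)
    prob = 1 / (1 + np.exp(-exog[:, :2].sum(1)))
    success = np.random.binomial(n_trials, prob)
    endog = np.column_stack((success, n_trials - success))
    res = GLM(endog, exog, family=sm.families.Binomial()).fit()
    assert res.fittedvalues.shape == (100,)
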

# FIXME: enable/xfail/skip or delete
# TODO:
# Non-Canonical Links for the Binomial family require the algorithm to be
# slightly changed
# class TestGlmBinomialLog(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialLogit(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialProbit(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialCloglog(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialPower(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialLoglog(CheckModelResultsMixin):
#    pass

# class TestGlmBinomialLogc(CheckModelResultsMixin):
# TODO: need include logc link
#    pass


class TestGlmBernoulli(CheckModelResultsMixin, CheckComparisonMixin):
    @classmethod
    def setup_class(cls):
        from .results.results_glm import Lbw
        cls.res2 = Lbw()
        cls.res1 = GLM(cls.res2.endog, cls.res2.exog,
                       family=sm.families.Binomial()).fit()

        modd = discrete.Logit(cls.res2.endog, cls.res2.exog)
        cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False)

    def test_score_r(self):
        res1 = self.res1
        res2 = self.res2
        st, pv, df = res1.model.score_test(res1.params,
                                           exog_extra=res1.model.exog[:, 1]**2)
        st_res = 0.2837680293459376  # (-0.5326988167303712)**2
        assert_allclose(st, st_res, rtol=1e-4)

        st, pv, df = res1.model.score_test(res1.params,
                                           exog_extra=res1.model.exog[:, 0]**2)
        st_res = 0.6713492821514992  # (-0.8193590679009413)**2
        assert_allclose(st, st_res, rtol=1e-4)

        select = list(range(9))
        select.pop(7)

        res1b = GLM(res2.endog, res2.exog.iloc[:, select],
                    family=sm.families.Binomial()).fit()
        tres = res1b.model.score_test(res1b.params,
                                      exog_extra=res1.model.exog[:, -2])
        tres = np.asarray(tres[:2]).ravel()
        tres_r = (2.7864148487452, 0.0950667)
        assert_allclose(tres, tres_r, rtol=1e-4)

        cmd_r = """\
        data = read.csv("...statsmodels\\statsmodels\\genmod\\tests\\results\\stata_lbw_glm.csv")

        data["race_black"] = data["race"] == "black"
        data["race_other"] = data["race"] == "other"
        mod = glm(low ~ age + lwt + race_black + race_other + smoke + ptl + ht + ui, family=binomial, data=data)
        options(digits=16)
        anova(mod, test="Rao")

        library(statmod)
        s = glm.scoretest(mod, data["age"]**2)
        s**2
        s = glm.scoretest(mod, data["lwt"]**2)
        s**2
        """

# class TestGlmBernoulliIdentity(CheckModelResultsMixin):
#    pass

# class TestGlmBernoulliLog(CheckModelResultsMixin):
#    pass

# class TestGlmBernoulliProbit(CheckModelResultsMixin):
#    pass

# class TestGlmBernoulliCloglog(CheckModelResultsMixin):
#    pass

# class TestGlmBernoulliPower(CheckModelResultsMixin):
#    pass

# class TestGlmBernoulliLoglog(CheckModelResultsMixin):
#    pass

# class test_glm_bernoulli_logc(CheckModelResultsMixin):
#    pass


class TestGlmGamma(CheckModelResultsMixin):

    @classmethod
    def setup_class(cls):
        '''
        Tests Gamma family with canonical inverse link (power -1)
        '''
        # Test Precisions
        cls.decimal_aic_R = -1  # TODO: off by about 1, we are right with Stata
        cls.decimal_resids = DECIMAL_2

        from statsmodels.datasets.scotland import load

        from .results.results_glm import Scotvote
        data = load()
        data.exog = add_constant(data.exog, prepend=False)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            res1 = GLM(data.endog, data.exog,
                       family=sm.families.Gamma()).fit()
        cls.res1 = res1
#        res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma)
        res2 = Scotvote()
        res2.aic_R += 2  # R does not count degree of freedom for scale with gamma
        cls.res2 = res2


class TestGlmGammaLog(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_fittedvalues = DECIMAL_3

        from .results.results_glm import CancerLog
        res2 = CancerLog()
        cls.res1 = GLM(res2.endog, res2.exog,
            family=sm.families.Gamma(link=sm.families.links.log())).fit()
        cls.res2 = res2

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
#            family=r.Gamma(link="log"))
#        cls.res2.null_deviance = 27.92207137420696 # From R (bug in rpy)
#        cls.res2.bic = -154.1582089453923 # from Stata


class TestGlmGammaIdentity(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precisions
        cls.decimal_resids = -100  # TODO: Very off from Stata?
        cls.decimal_params = DECIMAL_2
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_loglike = DECIMAL_1

        from .results.results_glm import CancerIdentity
        res2 = CancerIdentity()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            fam = sm.families.Gamma(link=sm.families.links.identity())
            cls.res1 = GLM(res2.endog, res2.exog, family=fam).fit()
        cls.res2 = res2

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
#            family=r.Gamma(link="identity"))
#        cls.res2.null_deviance = 27.92207137420696 # from R, Rpy bug


class TestGlmPoisson(CheckModelResultsMixin, CheckComparisonMixin):
    @classmethod
    def setup_class(cls):
        '''
        Tests Poisson family with canonical log link.

        Test results were obtained by R.
        '''
        from .results.results_glm import Cpunish
        cls.data = cpunish.load()
        cls.data.endog = np.asarray(cls.data.endog)
        cls.data.exog = np.asarray(cls.data.exog)
        cls.data.exog[:, 3] = np.log(cls.data.exog[:, 3])
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        cls.res1 = GLM(cls.data.endog, cls.data.exog,
                       family=sm.families.Poisson()).fit()
        cls.res2 = Cpunish()
        # compare with discrete, start close to save time
        modd = discrete.Poisson(cls.data.endog, cls.data.exog)
        cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False)

#class TestGlmPoissonIdentity(CheckModelResultsMixin):
#    pass

#class TestGlmPoissonPower(CheckModelResultsMixin):
#    pass


class TestGlmInvgauss(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Tests the Inverse Gaussian family in GLM.

        Notes
        -----
        Used the rndivgx.ado file provided by Hardin and Hilbe to
        generate the data.  Results are read from model_results, which
        were obtained by running R_ig.s
        '''
        # Test Precisions
        cls.decimal_aic_R = DECIMAL_0
        cls.decimal_loglike = DECIMAL_0

        from .results.results_glm import InvGauss
        res2 = InvGauss()
        res1 = GLM(res2.endog, res2.exog,
                   family=sm.families.InverseGaussian()).fit()
        cls.res1 = res1
        cls.res2 = res2

    def test_get_distribution(self):
        res1 = self.res1
        distr = res1.model.family.get_distribution(res1.fittedvalues,
                                                   res1.scale)
        var_endog = res1.model.family.variance(res1.fittedvalues) * res1.scale
        m, v = distr.stats()
        assert_allclose(res1.fittedvalues, m, rtol=1e-13)
        assert_allclose(var_endog, v, rtol=1e-13)


class TestGlmInvgaussLog(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precisions
        cls.decimal_aic_R = -10  # Big difference vs R.
        cls.decimal_resids = DECIMAL_3

        from .results.results_glm import InvGaussLog
        res2 = InvGaussLog()
        cls.res1 = GLM(res2.endog, res2.exog,
            family=sm.families.InverseGaussian(
                link=sm.families.links.log())).fit()
        cls.res2 = res2

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
#            family=r.inverse_gaussian(link="log"))
#        cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug
#        cls.res2.llf = -12162.72308 # from Stata, R's has big rounding diff


class TestGlmInvgaussIdentity(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        # Test Precisions
        cls.decimal_aic_R = -10  # TODO: Big difference vs R
        cls.decimal_fittedvalues = DECIMAL_3
        cls.decimal_params = DECIMAL_3

        from .results.results_glm import Medpar1
        data = Medpar1()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cls.res1 = GLM(data.endog, data.exog,
                           family=sm.families.InverseGaussian(
                               link=sm.families.links.identity())).fit()
        from .results.results_glm import InvGaussIdentity
        cls.res2 = InvGaussIdentity()

# FIXME: enable or delete
#    def setup(cls):
#        if skipR:
#            raise SkipTest, "Rpy not installed."
#        cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm,
#            family=r.inverse_gaussian(link="identity"))
#        cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug
#        cls.res2.llf = -12163.25545    # from Stata, big diff with R


class TestGlmNegbinomial(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Test Negative Binomial family with log link
        '''
        # Test Precision
        cls.decimal_resid = DECIMAL_1
        cls.decimal_params = DECIMAL_3
        cls.decimal_resids = -1  # 1 % mismatch at 0
        cls.decimal_fittedvalues = DECIMAL_1

        from statsmodels.datasets.committee import load
        cls.data = load()
        cls.data.endog = np.asarray(cls.data.endog)
        cls.data.exog = np.asarray(cls.data.exog)
        cls.data.exog[:, 2] = np.log(cls.data.exog[:, 2])
        interaction = cls.data.exog[:, 2]*cls.data.exog[:, 1]
        cls.data.exog = np.column_stack((cls.data.exog, interaction))
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=DomainWarning)
            fam = sm.families.NegativeBinomial()

        cls.res1 = GLM(cls.data.endog, cls.data.exog,
                family=fam).fit(scale='x2')
        from .results.results_glm import Committee
        res2 = Committee()
        res2.aic_R += 2  # They do not count a degree of freedom for the scale
        cls.res2 = res2

# FIXME: enable or delete
#    def setup(self):
#        if skipR:
#            raise SkipTest, "Rpy not installed"
#        r.library('MASS')  # this does not work when done in rmodelwrap?
#        self.res2 = RModel(self.data.endog, self.data.exog, r.glm,
#                family=r.negative_binomial(1))
#        self.res2.null_deviance = 27.8110469364343

# FIXME: enable/xfail/skip or delete
#class TestGlmNegbinomial_log(CheckModelResultsMixin):
#    pass

# FIXME: enable/xfail/skip or delete
#class TestGlmNegbinomial_power(CheckModelResultsMixin):
#    pass

# FIXME: enable/xfail/skip or delete
#class TestGlmNegbinomial_nbinom(CheckModelResultsMixin):
#    pass


class TestGlmPoissonOffset(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        from .results.results_glm import Cpunish_offset
        cls.decimal_params = DECIMAL_4
        cls.decimal_bse = DECIMAL_4
        cls.decimal_aic_R = 3
        data = cpunish.load()
        data.endog = np.asarray(data.endog)
        data.exog = np.asarray(data.exog)
        data.exog[:, 3] = np.log(data.exog[:, 3])
        data.exog = add_constant(data.exog, prepend=True)
        exposure = [100] * len(data.endog)
        cls.data = data
        cls.exposure = exposure
        cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(),
                       exposure=exposure).fit()
        cls.res2 = Cpunish_offset()

    def test_missing(self):
        # make sure offset is dropped correctly
        endog = self.data.endog.copy()
        endog[[2, 4, 6, 8]] = np.nan
        mod = GLM(endog, self.data.exog, family=sm.families.Poisson(),
                  exposure=self.exposure, missing='drop')
        assert_equal(mod.exposure.shape[0], 13)

    def test_offset_exposure(self):
        # exposure=x and offset=log(x) should have the same effect
        np.random.seed(382304)
        endog = np.random.randint(0, 10, 100)
        exog = np.random.normal(size=(100, 3))
        exposure = np.random.uniform(1, 2, 100)
        offset = np.random.uniform(1, 2, 100)
        mod1 = GLM(endog, exog, family=sm.families.Poisson(),
                   offset=offset, exposure=exposure).fit()
        offset2 = offset + np.log(exposure)
        mod2 = GLM(endog, exog, family=sm.families.Poisson(),
                   offset=offset2).fit()
        assert_almost_equal(mod1.params, mod2.params)
        assert_allclose(mod1.null, mod2.null, rtol=1e-10)

        # test recreating model
        mod1_ = mod1.model
        kwds = mod1_._get_init_kwds()
        assert_allclose(kwds['exposure'], exposure, rtol=1e-14)
        assert_allclose(kwds['offset'], mod1_.offset, rtol=1e-14)
        mod3 = mod1_.__class__(mod1_.endog, mod1_.exog, **kwds)
        assert_allclose(mod3.exposure, mod1_.exposure, rtol=1e-14)
        assert_allclose(mod3.offset, mod1_.offset, rtol=1e-14)

        # test fit_regularized exposure, see #4605
        resr1 = mod1.model.fit_regularized()
        resr2 = mod2.model.fit_regularized()
        assert_allclose(resr1.params, resr2.params, rtol=1e-10)

    def test_predict(self):
        np.random.seed(382304)
        endog = np.random.randint(0, 10, 100)
        exog = np.random.normal(size=(100, 3))
        exposure = np.random.uniform(1, 2, 100)
        mod1 = GLM(endog, exog, family=sm.families.Poisson(),
                   exposure=exposure).fit()
        exog1 = np.random.normal(size=(10, 3))
        exposure1 = np.random.uniform(1, 2, 10)

        # Doubling exposure time should double expected response
        pred1 = mod1.predict(exog=exog1, exposure=exposure1)
        pred2 = mod1.predict(exog=exog1, exposure=2*exposure1)
        assert_almost_equal(pred2, 2*pred1)

        # Check exposure defaults
        pred3 = mod1.predict()
        pred4 = mod1.predict(exposure=exposure)
        pred5 = mod1.predict(exog=exog, exposure=exposure)
        assert_almost_equal(pred3, pred4)
        assert_almost_equal(pred4, pred5)

        # Check offset defaults
        offset = np.random.uniform(1, 2, 100)
        mod2 = GLM(endog, exog, offset=offset,
                   family=sm.families.Poisson()).fit()
        pred1 = mod2.predict()
        pred2 = mod2.predict(offset=offset)
        pred3 = mod2.predict(exog=exog, offset=offset)
        assert_almost_equal(pred1, pred2)
        assert_almost_equal(pred2, pred3)

        # Check that offset shifts the linear predictor
        mod3 = GLM(endog, exog, family=sm.families.Poisson()).fit()
        offset = np.random.uniform(1, 2, 10)
        pred1 = mod3.predict(exog=exog1, offset=offset, linear=True)
        pred2 = mod3.predict(exog=exog1, offset=2*offset, linear=True)
        assert_almost_equal(pred2, pred1+offset)

        # Passing exposure as a pandas Series should not affect output type
        assert isinstance(
            mod1.predict(exog=exog1, exposure=pd.Series(exposure1)),
            np.ndarray
        )

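
# Illustrative sketch of the identity behind test_offset_exposure above
# (underscored, not collected by pytest; synthetic data): with a log link,
# exposure enters the linear predictor as log(exposure), so passing
# exposure=e is equivalent to passing offset=np.log(e).
def _sketch_offset_exposure_equivalence():
    np.random.seed(0)
    endog = np.random.randint(0, 10, 50)
    exog = add_constant(np.random.normal(size=(50, 2)), prepend=False)
    exposure = np.random.uniform(1, 2, 50)
    res_exp = GLM(endog, exog, family=sm.families.Poisson(),
                  exposure=exposure).fit()
    res_off = GLM(endog, exog, family=sm.families.Poisson(),
                  offset=np.log(exposure)).fit()
    assert_allclose(res_exp.params, res_off.params, rtol=1e-10)
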

def test_perfect_pred(iris):
    y = iris[:, -1]
    X = iris[:, :-1]
    X = X[y != 2]
    y = y[y != 2]
    X = add_constant(X, prepend=True)
    glm = GLM(y, X, family=sm.families.Binomial())
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        assert_raises(PerfectSeparationError, glm.fit)


def test_score_test_ols():
    # nicer example than Longley
    from statsmodels.regression.linear_model import OLS
    np.random.seed(5)
    nobs = 100
    sige = 0.5
    x = np.random.uniform(0, 1, size=(nobs, 5))
    x[:, 0] = 1
    beta = 1. / np.arange(1., x.shape[1] + 1)
    y = x.dot(beta) + sige * np.random.randn(nobs)

    res_ols = OLS(y, x).fit()
    res_olsc = OLS(y, x[:, :-2]).fit()
    co = res_ols.compare_lm_test(res_olsc, demean=False)

    res_glm = GLM(y, x[:, :-2], family=sm.families.Gaussian()).fit()
    co2 = res_glm.model.score_test(res_glm.params, exog_extra=x[:, -2:])
    # difference in df_resid versus nobs in scale see #1786
    assert_allclose(co[0] * 97 / 100., co2[0], rtol=1e-13)


def test_attribute_writable_resettable():
    # Regression test for mutables and class constructors.
    data = sm.datasets.longley.load()
    endog, exog = data.endog, data.exog
    glm_model = sm.GLM(endog, exog)
    assert_equal(glm_model.family.link.power, 1.0)
    glm_model.family.link.power = 2.
    assert_equal(glm_model.family.link.power, 2.0)
    glm_model2 = sm.GLM(endog, exog)
    assert_equal(glm_model2.family.link.power, 1.0)


class TestStartParams(CheckModelResultsMixin):
    @classmethod
    def setup_class(cls):
        '''
        Test Gaussian family with canonical identity link
        '''
        # Test Precisions
        cls.decimal_resids = DECIMAL_3
        cls.decimal_params = DECIMAL_2
        cls.decimal_bic = DECIMAL_0
        cls.decimal_bse = DECIMAL_3

        from statsmodels.datasets.longley import load
        cls.data = load()
        cls.data.exog = add_constant(cls.data.exog, prepend=False)
        params = sm.OLS(cls.data.endog, cls.data.exog).fit().params
        cls.res1 = GLM(cls.data.endog, cls.data.exog,
                       family=sm.families.Gaussian()).fit(start_params=params)
        from .results.results_glm import Longley
        cls.res2 = Longley()


def test_glm_start_params():
    # see 1604
    y2 = np.array('0 1 0 0 0 1'.split(), int)
    wt = np.array([50, 1, 50, 1, 5, 10])
    y2 = np.repeat(y2, wt)
    x2 = np.repeat([0, 0, 0.001, 100, -1, -1], wt)
    mod = sm.GLM(y2, sm.add_constant(x2), family=sm.families.Binomial())
    res = mod.fit(start_params=[-4, -5])
    np.testing.assert_almost_equal(res.params, [-4.60305022, -5.29634545], 6)


def test_loglike_no_opt():
    # see 1728

    y = np.asarray([0, 1, 0, 0, 1, 1, 0, 1, 1, 1])
    x = np.arange(10, dtype=np.float64)

    def llf(params):
        lin_pred = params[0] + params[1]*x
        pr = 1 / (1 + np.exp(-lin_pred))
        return np.sum(y*np.log(pr) + (1-y)*np.log(1-pr))

    for params in [0, 0], [0, 1], [0.5, 0.5]:
        mod = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial())
        res = mod.fit(start_params=params, maxiter=0)
        like = llf(params)
        assert_almost_equal(like, res.llf)


def test_formula_missing_exposure():
    # see 2083
    import statsmodels.formula.api as smf

    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan],
         'constant': [1] * 4, 'exposure': np.random.uniform(size=4),
         'x': [1, 3, 2, 1.5]}
    df = pd.DataFrame(d)

    family = sm.families.Gaussian(link=sm.families.links.log())

    mod = smf.glm("Foo ~ Bar", data=df, exposure=df.exposure,
                  family=family)
    assert_(type(mod.exposure) is np.ndarray, msg='Exposure is not ndarray')

    exposure = pd.Series(np.random.uniform(size=5))
    # NaN is not relevant here; the ValueError is for the shape mismatch
    df.loc[3, 'Bar'] = 4
    assert_raises(ValueError, smf.glm, "Foo ~ Bar", data=df,
                  exposure=exposure, family=family)
    assert_raises(ValueError, GLM, df.Foo, df[['constant', 'Bar']],
                  exposure=exposure, family=family)


@pytest.mark.matplotlib
def test_plots(close_figures):

    np.random.seed(378)
    n = 200
    exog = np.random.normal(size=(n, 2))
    lin_pred = exog[:, 0] + exog[:, 1]**2
    prob = 1 / (1 + np.exp(-lin_pred))
    endog = 1 * (np.random.uniform(size=n) < prob)

    model = sm.GLM(endog, exog, family=sm.families.Binomial())
    result = model.fit()

    import pandas as pd

    from statsmodels.graphics.regressionplots import add_lowess

    # array interface
    for j in 0, 1:
        fig = result.plot_added_variable(j)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = result.plot_partial_residuals(j)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = result.plot_ceres_residuals(j)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)

    # formula interface
    data = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]})
    model = sm.GLM.from_formula("y ~ x1 + x2", data,
                                family=sm.families.Binomial())
    result = model.fit()
    for j in 0, 1:
        xname = ["x1", "x2"][j]
        fig = result.plot_added_variable(xname)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = result.plot_partial_residuals(xname)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)
        fig = result.plot_ceres_residuals(xname)
        add_lowess(fig.axes[0], frac=0.5)
        close_or_save(pdf, fig)


def gen_endog(lin_pred, family_class, link, binom_version=0):

    np.random.seed(872)

    fam = sm.families

    mu = link().inverse(lin_pred)

    if family_class == fam.Binomial:
        if binom_version == 0:
            endog = 1*(np.random.uniform(size=len(lin_pred)) < mu)
        else:
            endog = np.empty((len(lin_pred), 2))
            n = 10
            endog[:, 0] = (np.random.uniform(size=(len(lin_pred), n))
                           < mu[:, None]).sum(1)
            endog[:, 1] = n - endog[:, 0]
    elif family_class == fam.Poisson:
        endog = np.random.poisson(mu)
    elif family_class == fam.Gamma:
        endog = np.random.gamma(2, mu)
    elif family_class == fam.Gaussian:
        endog = mu + 2 * np.random.normal(size=len(lin_pred))
    elif family_class == fam.NegativeBinomial:
        from scipy.stats.distributions import nbinom
        endog = nbinom.rvs(mu, 0.5)
    elif family_class == fam.InverseGaussian:
        from scipy.stats.distributions import invgauss
        endog = invgauss.rvs(mu, scale=20)
    else:
        raise ValueError

    return endog

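
# Illustrative usage of gen_endog above (underscored, not collected by
# pytest): draw a response for a chosen family/link pair from a linear
# predictor, mirroring how the gradient-vs-IRLS tests below drive it.
def _sketch_gen_endog_usage():
    lin_pred = np.random.uniform(size=50)
    endog = gen_endog(lin_pred, sm.families.Poisson, sm.families.links.log)
    assert endog.shape == (50,)
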

@pytest.mark.smoke
def test_summary():
    np.random.seed(4323)

    n = 100
    exog = np.random.normal(size=(n, 2))
    exog[:, 0] = 1
    endog = np.random.normal(size=n)

    for method in ["irls", "cg"]:
        fa = sm.families.Gaussian()
        model = sm.GLM(endog, exog, family=fa)
        rslt = model.fit(method=method)
        s = rslt.summary()


def check_score_hessian(results):
    # compare the model's score and hessian with numerical derivatives

    params = results.params
    # avoid checking score at MLE, score close to zero
    sc = results.model.score(params * 0.98, scale=1)
    # cs currently (0.9) does not work for all families
    llfunc = lambda x: results.model.loglike(x, scale=1)  # noqa
    sc2 = approx_fprime(params * 0.98, llfunc)
    assert_allclose(sc, sc2, rtol=1e-4, atol=1e-4)

    hess = results.model.hessian(params, scale=1)
    hess2 = approx_hess(params, llfunc)
    assert_allclose(hess, hess2, rtol=1e-4)
    scfunc = lambda x: results.model.score(x, scale=1)  # noqa
    hess3 = approx_fprime(params, scfunc)
    assert_allclose(hess, hess3, rtol=1e-4)

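
# Illustrative sketch (underscored, not collected by pytest; placeholder
# data): any cheaply fitted GLM can be fed to check_score_hessian above,
# which validates the analytic score and hessian against numdiff
# approximations of the log-likelihood.
def _sketch_check_score_hessian_usage():
    np.random.seed(12345)
    exog = add_constant(np.random.normal(size=(100, 2)), prepend=False)
    endog = np.random.poisson(np.exp(0.1 * exog[:, :2].sum(1)))
    res = GLM(endog, exog, family=sm.families.Poisson()).fit()
    check_score_hessian(res)
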

def test_gradient_irls():
    # Compare the results when using gradient optimization and IRLS.

    # TODO: Find working examples for inverse_squared link

    np.random.seed(87342)

    fam = sm.families
    lnk = sm.families.links
    families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log,
                                lnk.cauchy]),
                (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]),
                (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]),
                (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]),
                (fam.InverseGaussian, [lnk.log, lnk.identity,
                                       lnk.inverse_power,
                                       lnk.inverse_squared]),
                (fam.NegativeBinomial, [lnk.log, lnk.inverse_power,
                                        lnk.inverse_squared, lnk.identity])]

    n = 100
    p = 3
    exog = np.random.normal(size=(n, p))
    exog[:, 0] = 1

    skip_one = False
    for family_class, family_links in families:
        for link in family_links:
            for binom_version in 0, 1:

                if family_class != fam.Binomial and binom_version == 1:
                    continue

                if (family_class, link) == (fam.Poisson, lnk.identity):
                    lin_pred = 20 + exog.sum(1)
                elif (family_class, link) == (fam.Binomial, lnk.log):
                    lin_pred = -1 + exog.sum(1) / 8
                elif (family_class, link) == (fam.Poisson, lnk.sqrt):
                    lin_pred = 2 + exog.sum(1)
                elif (family_class, link) == (fam.InverseGaussian, lnk.log):
                    # skip_zero = True
                    lin_pred = -1 + exog.sum(1)
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_squared):
                    lin_pred = 0.5 + exog.sum(1) / 5
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.InverseGaussian,
                                              lnk.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.identity):
                    lin_pred = 20 + 5*exog.sum(1)
                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_squared):
                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
                    continue  # skip due to non-convergence
                elif (family_class, link) == (fam.NegativeBinomial,
                                              lnk.inverse_power):
                    lin_pred = 1 + exog.sum(1) / 5

                elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
                    # adding skip because of convergence failure
                    skip_one = True
                # the following fails with identity link, because endog < 0
                # elif family_class == fam.Gamma:
                #     lin_pred = 0.5 * exog.sum(1) + np.random.uniform(size=exog.shape[0])
                else:
                    lin_pred = np.random.uniform(size=exog.shape[0])

                endog = gen_endog(lin_pred, family_class, link, binom_version)

                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    mod_irls = sm.GLM(endog, exog,
                                      family=family_class(link=link()))
                rslt_irls = mod_irls.fit(method="IRLS")

                if (family_class, link) not in [(fam.Poisson, lnk.sqrt),
                                                (fam.Gamma, lnk.inverse_power),
                                                (fam.InverseGaussian,
                                                 lnk.identity)]:
                    check_score_hessian(rslt_irls)

                # Try with and without starting values.
                for max_start_irls, start_params in ((0, rslt_irls.params),
                                                     (3, None)):
                    # TODO: skip convergence failures for now
                    if max_start_irls > 0 and skip_one:
                        continue
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        mod_gradient = sm.GLM(endog, exog,
                                              family=family_class(link=link()))
                    rslt_gradient = mod_gradient.fit(
                        max_start_irls=max_start_irls,
                        start_params=start_params,
                        method="newton", maxiter=300)

                    assert_allclose(rslt_gradient.params,
                                    rslt_irls.params, rtol=1e-6, atol=5e-5)

                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
                                    rtol=1e-6, atol=1e-6)

                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
                                    rtol=1e-6, atol=1e-6)

                    # Get the standard errors using expected information.
                    gradient_bse = rslt_gradient.bse
                    ehess = mod_gradient.hessian(rslt_gradient.params,
                                                 observed=False)
                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
                    assert_allclose(gradient_bse, rslt_irls.bse,
                                    rtol=1e-6, atol=5e-5)
                    # rslt_irls.bse corresponds to observed=True
                    assert_allclose(rslt_gradient.bse, rslt_irls.bse,
                                    rtol=0.2, atol=5e-5)

                    rslt_gradient_eim = mod_gradient.fit(
                        max_start_irls=0,
                        cov_type='eim',
                        start_params=rslt_gradient.params,
                        method="newton", maxiter=300)
                    assert_allclose(rslt_gradient_eim.bse, rslt_irls.bse,
                                    rtol=5e-5, atol=0)

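
# Illustrative sketch of the observed/expected information distinction used
# above (underscored, not collected by pytest; synthetic data):
# hessian(..., observed=False) gives the expected-information (EIM) hessian
# and observed=True the observed (OIM) one; for a canonical link such as
# Poisson/log the two coincide.
def _sketch_oim_vs_eim():
    np.random.seed(424)
    exog = add_constant(np.random.normal(size=(100, 2)), prepend=False)
    endog = np.random.poisson(np.exp(0.1 * exog[:, :2].sum(1)))
    mod = GLM(endog, exog, family=sm.families.Poisson())
    res = mod.fit()
    h_obs = mod.hessian(res.params, observed=True)
    h_exp = mod.hessian(res.params, observed=False)
    assert_allclose(h_obs, h_exp, rtol=1e-10)
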

def test_gradient_irls_eim():
    # Compare the results when using EIM gradient optimization and IRLS.
1306
1307    # TODO: Find working examples for inverse_squared link
1308
1309    np.random.seed(87342)
1310
1311    fam = sm.families
1312    lnk = sm.families.links
1313    families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log,
1314                                lnk.cauchy]),
1315                (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]),
1316                (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]),
1317                (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]),
1318                (fam.InverseGaussian, [lnk.log, lnk.identity,
1319                                       lnk.inverse_power,
1320                                       lnk.inverse_squared]),
1321                (fam.NegativeBinomial, [lnk.log, lnk.inverse_power,
1322                                        lnk.inverse_squared, lnk.identity])]
1323
1324    n = 100
1325    p = 3
1326    exog = np.random.normal(size=(n, p))
1327    exog[:, 0] = 1
1328
1329    skip_one = False
1330    for family_class, family_links in families:
1331        for link in family_links:
1332            for binom_version in 0, 1:
1333
1334                if family_class != fam.Binomial and binom_version == 1:
1335                    continue
1336
1337                if (family_class, link) == (fam.Poisson, lnk.identity):
1338                    lin_pred = 20 + exog.sum(1)
1339                elif (family_class, link) == (fam.Binomial, lnk.log):
1340                    lin_pred = -1 + exog.sum(1) / 8
1341                elif (family_class, link) == (fam.Poisson, lnk.sqrt):
1342                    lin_pred = 2 + exog.sum(1)
1343                elif (family_class, link) == (fam.InverseGaussian, lnk.log):
1344                    # skip_zero = True
1345                    lin_pred = -1 + exog.sum(1)
1346                elif (family_class, link) == (fam.InverseGaussian,
1347                                              lnk.identity):
1348                    lin_pred = 20 + 5*exog.sum(1)
1349                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
1350                elif (family_class, link) == (fam.InverseGaussian,
1351                                              lnk.inverse_squared):
1352                    lin_pred = 0.5 + exog.sum(1) / 5
1353                    continue  # skip due to non-convergence
1354                elif (family_class, link) == (fam.InverseGaussian,
1355                                              lnk.inverse_power):
1356                    lin_pred = 1 + exog.sum(1) / 5
1357                elif (family_class, link) == (fam.NegativeBinomial,
1358                                              lnk.identity):
1359                    lin_pred = 20 + 5*exog.sum(1)
1360                    lin_pred = np.clip(lin_pred, 1e-4, np.inf)
1361                elif (family_class, link) == (fam.NegativeBinomial,
1362                                              lnk.inverse_squared):
1363                    lin_pred = 0.1 + np.random.uniform(size=exog.shape[0])
1364                    continue  # skip due to non-convergence
1365                elif (family_class, link) == (fam.NegativeBinomial,
1366                                              lnk.inverse_power):
1367                    lin_pred = 1 + exog.sum(1) / 5
1368
1369                elif (family_class, link) == (fam.Gaussian, lnk.inverse_power):
1370                    # adding skip because of convergence failure
1371                    skip_one = True
1372                else:
1373                    lin_pred = np.random.uniform(size=exog.shape[0])
1374
1375                endog = gen_endog(lin_pred, family_class, link, binom_version)
1376
1377                with warnings.catch_warnings():
1378                    warnings.simplefilter("ignore")
1379                    mod_irls = sm.GLM(endog, exog,
1380                                      family=family_class(link=link()))
1381                rslt_irls = mod_irls.fit(method="IRLS")
1382
1383                # Try with and without starting values.
1384                for max_start_irls, start_params in ((0, rslt_irls.params),
1385                                                     (3, None)):
1386                    # TODO: skip convergence failures for now
1387                    if max_start_irls > 0 and skip_one:
1388                        continue
1389                    with warnings.catch_warnings():
1390                        warnings.simplefilter("ignore")
1391                        mod_gradient = sm.GLM(endog, exog,
1392                                              family=family_class(link=link()))
1393                    rslt_gradient = mod_gradient.fit(
1394                            max_start_irls=max_start_irls,
1395                            start_params=start_params,
1396                            method="newton",
1397                            optim_hessian='eim'
1398                    )
1399
1400                    assert_allclose(rslt_gradient.params, rslt_irls.params,
1401                                    rtol=1e-6, atol=5e-5)
1402
1403                    assert_allclose(rslt_gradient.llf, rslt_irls.llf,
1404                                    rtol=1e-6, atol=1e-6)
1405
1406                    assert_allclose(rslt_gradient.scale, rslt_irls.scale,
1407                                    rtol=1e-6, atol=1e-6)
1408
1409                    # Get the standard errors using expected information.
1410                    ehess = mod_gradient.hessian(rslt_gradient.params,
1411                                                 observed=False)
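                    # With observed=False the hessian is the negative
                    # expected information matrix, so -inv(ehess) estimates
                    # the covariance of the parameter estimates.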
1412                    gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess)))
1413
1414                    assert_allclose(gradient_bse, rslt_irls.bse, rtol=1e-6,
1415                                    atol=5e-5)
1416
1417
1418def test_glm_irls_method():
1419    nobs, k_vars = 50, 4
1420    np.random.seed(987126)
1421    x = np.random.randn(nobs, k_vars - 1)
1422    exog = add_constant(x, has_constant='add')
1423    y = exog.sum(1) + np.random.randn(nobs)
1424
1425    mod = GLM(y, exog)
1426    res1 = mod.fit()
1427    res2 = mod.fit(wls_method='pinv', attach_wls=True)
1428    res3 = mod.fit(wls_method='qr', attach_wls=True)
1429    # fit_gradient does not attach mle_settings
1430    res_g1 = mod.fit(start_params=res1.params, method='bfgs')
1431
1432    for r in [res1, res2, res3]:
1433        assert_equal(r.mle_settings['optimizer'], 'IRLS')
1434        assert_equal(r.method, 'IRLS')
1435
1436    assert_equal(res1.mle_settings['wls_method'], 'lstsq')
1437    assert_equal(res2.mle_settings['wls_method'], 'pinv')
1438    assert_equal(res3.mle_settings['wls_method'], 'qr')
1439
1440    assert_(hasattr(res2.results_wls.model, 'pinv_wexog'))
1441    assert_(hasattr(res3.results_wls.model, 'exog_Q'))
1442
1443    # fit_gradient currently does not attach mle_settings
1444    assert_equal(res_g1.method, 'bfgs')
1445
1446
1447class CheckWtdDuplicationMixin(object):
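    '''
    Check that a GLM fit with integer freq_weights matches the fit on an
    expanded dataset in which each observation is repeated weight times.
    res1 is the freq_weights fit, res2 the expanded-data fit.
    '''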
1448    decimal_params = DECIMAL_4
1449
1450    @classmethod
1451    def setup_class(cls):
1452        cls.data = cpunish.load()
1453        cls.data.endog = np.asarray(cls.data.endog)
1454        cls.data.exog = np.asarray(cls.data.exog)
1455        cls.endog = cls.data.endog
1456        cls.exog = cls.data.exog
1457        np.random.seed(1234)
1458        cls.weight = np.random.randint(5, 100, len(cls.endog))
1459        cls.endog_big = np.repeat(cls.endog, cls.weight)
1460        cls.exog_big = np.repeat(cls.exog, cls.weight, axis=0)
1461
1462    def test_params(self):
        assert_allclose(self.res1.params, self.res2.params, atol=1e-6,
                        rtol=1e-6)
1465
1466    decimal_bse = DECIMAL_4
1467
1468    def test_standard_errors(self):
1469        assert_allclose(self.res1.bse, self.res2.bse, rtol=1e-5, atol=1e-6)
1470
1471    decimal_resids = DECIMAL_4
1472
1473    # TODO: This does not work... Arrays are of different shape.
1474    # Perhaps we use self.res1.model.family.resid_XXX()?
1475    """
1476    def test_residuals(self):
1477        resids1 = np.column_stack((self.res1.resid_pearson,
1478                                   self.res1.resid_deviance,
1479                                   self.res1.resid_working,
1480                                   self.res1.resid_anscombe,
1481                                   self.res1.resid_response))
        resids2 = np.column_stack((self.res2.resid_pearson,
1483                                   self.res2.resid_deviance,
1484                                   self.res2.resid_working,
1485                                   self.res2.resid_anscombe,
1486                                   self.res2.resid_response))
        assert_allclose(resids1, resids2, atol=10**(-self.decimal_resids))
1488    """
1489
    def test_aic(self):
        # the freq_weights fit and the expanded-data fit should produce
        # identical AIC values
        assert_allclose(self.res1.aic, self.res2.aic, atol=1e-6, rtol=1e-6)
1494
1495    def test_deviance(self):
        assert_allclose(self.res1.deviance, self.res2.deviance, atol=1e-6,
                        rtol=1e-6)
1498
1499    def test_scale(self):
1500        assert_allclose(self.res1.scale, self.res2.scale, atol=1e-6, rtol=1e-6)
1501
    def test_loglike(self):
        # llf of the freq_weights fit should match the expanded-data fit
        assert_allclose(self.res1.llf, self.res2.llf, rtol=1e-6)
1506
1507    decimal_null_deviance = DECIMAL_4
1508
1509    def test_null_deviance(self):
1510        with warnings.catch_warnings():
1511            warnings.simplefilter("ignore", DomainWarning)
1512
1513            assert_allclose(self.res1.null_deviance,
1514                            self.res2.null_deviance,
1515                            atol=1e-6,
1516                            rtol=1e-6)
1517
1518    decimal_bic = DECIMAL_4
1519
1520    def test_bic(self):
1521        with warnings.catch_warnings():
1522            warnings.simplefilter("ignore")
            assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6)
1524
1525    decimal_fittedvalues = DECIMAL_4
1526
1527    def test_fittedvalues(self):
1528        res2_fitted = self.res2.predict(self.res1.model.exog)
1529        assert_allclose(self.res1.fittedvalues, res2_fitted, atol=1e-5,
1530                        rtol=1e-5)
1531
1532    decimal_tpvalues = DECIMAL_4
1533
1534    def test_tpvalues(self):
1535        # test comparing tvalues and pvalues with normal implementation
1536        # make sure they use normal distribution (inherited in results class)
1537        assert_allclose(self.res1.tvalues, self.res2.tvalues, atol=1e-6,
1538                        rtol=2e-4)
1539        assert_allclose(self.res1.pvalues, self.res2.pvalues, atol=1e-6,
1540                        rtol=1e-6)
1541        assert_allclose(self.res1.conf_int(), self.res2.conf_int(), atol=1e-6,
1542                        rtol=1e-6)
1543
1544
class TestWtdGlmPoisson(CheckWtdDuplicationMixin):
    @classmethod
1548    def setup_class(cls):
1549        '''
1550        Tests Poisson family with canonical log link.
1551        '''
1552        super(TestWtdGlmPoisson, cls).setup_class()
1553        cls.endog = np.asarray(cls.endog)
1554        cls.exog = np.asarray(cls.exog)
1555
1556        cls.res1 = GLM(cls.endog, cls.exog,
1557                        freq_weights=cls.weight,
1558                        family=sm.families.Poisson()).fit()
1559        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1560                        family=sm.families.Poisson()).fit()
1561
1562
1563class TestWtdGlmPoissonNewton(CheckWtdDuplicationMixin):
1564    @classmethod
1565    def setup_class(cls):
1566        '''
1567        Tests Poisson family with canonical log link.
1568        '''
1569        super(TestWtdGlmPoissonNewton, cls).setup_class()
1570
1571        start_params = np.array([1.82794424e-04, -4.76785037e-02,
1572                                 -9.48249717e-02, -2.92293226e-04,
1573                                 2.63728909e+00, -2.05934384e+01])
1574
1575        fit_kwds = dict(method='newton')
1576        cls.res1 = GLM(cls.endog, cls.exog,
1577                        freq_weights=cls.weight,
1578                        family=sm.families.Poisson()).fit(**fit_kwds)
1579        fit_kwds = dict(method='newton', start_params=start_params)
1580        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1581                        family=sm.families.Poisson()).fit(**fit_kwds)
1582
1583
1584class TestWtdGlmPoissonHC0(CheckWtdDuplicationMixin):
1585    @classmethod
    def setup_class(cls):
        '''
1589        Tests Poisson family with canonical log link.
1590        '''
1591        super(TestWtdGlmPoissonHC0, cls).setup_class()
1592
1593        start_params = np.array([1.82794424e-04, -4.76785037e-02,
1594                                 -9.48249717e-02, -2.92293226e-04,
1595                                 2.63728909e+00, -2.05934384e+01])
1596
1597        fit_kwds = dict(cov_type='HC0')
1598        cls.res1 = GLM(cls.endog, cls.exog,
1599                        freq_weights=cls.weight,
1600                        family=sm.families.Poisson()).fit(**fit_kwds)
1601        fit_kwds = dict(cov_type='HC0', start_params=start_params)
1602        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1603                        family=sm.families.Poisson()).fit(**fit_kwds)
1604
1605
1606class TestWtdGlmPoissonClu(CheckWtdDuplicationMixin):
1607    @classmethod
    def setup_class(cls):
        '''
1611        Tests Poisson family with canonical log link.
1612        '''
1613        super(TestWtdGlmPoissonClu, cls).setup_class()
1614
1615        start_params = np.array([1.82794424e-04, -4.76785037e-02,
1616                                 -9.48249717e-02, -2.92293226e-04,
1617                                 2.63728909e+00, -2.05934384e+01])
1618
1619        gid = np.arange(1, len(cls.endog) + 1) // 2
        fit_kwds = dict(cov_type='cluster',
                        cov_kwds={'groups': gid, 'use_correction': False})
1621
1623        with warnings.catch_warnings():
1624            warnings.simplefilter("ignore")
1625            cls.res1 = GLM(cls.endog, cls.exog,
1626                            freq_weights=cls.weight,
1627                            family=sm.families.Poisson()).fit(**fit_kwds)
1628            gidr = np.repeat(gid, cls.weight)
            fit_kwds = dict(cov_type='cluster',
                            cov_kwds={'groups': gidr,
                                      'use_correction': False})
1630            cls.res2 = GLM(cls.endog_big, cls.exog_big,
1631                            family=sm.families.Poisson()).fit(start_params=start_params,
1632                                                              **fit_kwds)
1633
1634
1635class TestWtdGlmBinomial(CheckWtdDuplicationMixin):
1636    @classmethod
    def setup_class(cls):
        '''
1640        Tests Binomial family with canonical logit link.
1641        '''
1642        super(TestWtdGlmBinomial, cls).setup_class()
1643        cls.endog = cls.endog / 100
1644        cls.endog_big = cls.endog_big / 100
1645        cls.res1 = GLM(cls.endog, cls.exog,
1646                       freq_weights=cls.weight,
1647                       family=sm.families.Binomial()).fit()
1648        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1649                       family=sm.families.Binomial()).fit()
1650
1651
1652class TestWtdGlmNegativeBinomial(CheckWtdDuplicationMixin):
1653    @classmethod
    def setup_class(cls):
        '''
1657        Tests Negative Binomial family with canonical link
1658        g(p) = log(p/(p + 1/alpha))
1659        '''
1660        super(TestWtdGlmNegativeBinomial, cls).setup_class()
1661        alpha = 1.
1662
1663        with warnings.catch_warnings():
1664            warnings.simplefilter("ignore", category=DomainWarning)
1665            family_link = sm.families.NegativeBinomial(
1666                link=sm.families.links.nbinom(alpha=alpha),
1667                alpha=alpha)
1668            cls.res1 = GLM(cls.endog, cls.exog,
1669                           freq_weights=cls.weight,
1670                           family=family_link).fit()
1671            cls.res2 = GLM(cls.endog_big, cls.exog_big,
1672                           family=family_link).fit()
1673
1674
1675class TestWtdGlmGamma(CheckWtdDuplicationMixin):
1676    @classmethod
    def setup_class(cls):
        '''
1680        Tests Gamma family with log link.
1681        '''
1682        super(TestWtdGlmGamma, cls).setup_class()
1683        family_link = sm.families.Gamma(sm.families.links.log())
1684        cls.res1 = GLM(cls.endog, cls.exog,
1685                       freq_weights=cls.weight,
1686                       family=family_link).fit()
1687        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1688                       family=family_link).fit()
1689
1690
1691class TestWtdGlmGaussian(CheckWtdDuplicationMixin):
1692    @classmethod
1693    def setup_class(cls):
1694        '''
1695        Tests Gaussian family with log link.
1696        '''
1697        super(TestWtdGlmGaussian, cls).setup_class()
1698        family_link = sm.families.Gaussian(sm.families.links.log())
1699        cls.res1 = GLM(cls.endog, cls.exog,
1700                       freq_weights=cls.weight,
1701                       family=family_link).fit()
1702        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1703                       family=family_link).fit()
1704
1705
1706class TestWtdGlmInverseGaussian(CheckWtdDuplicationMixin):
1707    @classmethod
1708    def setup_class(cls):
1709        '''
1710        Tests InverseGaussian family with log link.
1711        '''
1712        super(TestWtdGlmInverseGaussian, cls).setup_class()
1713        family_link = sm.families.InverseGaussian(sm.families.links.log())
1714        cls.res1 = GLM(cls.endog, cls.exog,
1715                       freq_weights=cls.weight,
1716                       family=family_link).fit()
1717        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1718                       family=family_link).fit()
1719
1720
1721class TestWtdGlmGammaNewton(CheckWtdDuplicationMixin):
1722    @classmethod
1723    def setup_class(cls):
1724        '''
1725        Tests Gamma family with log link.
1726        '''
1727        super(TestWtdGlmGammaNewton, cls).setup_class()
1728        family_link = sm.families.Gamma(sm.families.links.log())
1729        cls.res1 = GLM(cls.endog, cls.exog,
1730                       freq_weights=cls.weight,
1731                       family=family_link
1732                       ).fit(method='newton')
1733        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1734                       family=family_link
1735                       ).fit(method='newton')
1736
1737    def test_init_kwargs(self):
1738        family_link = sm.families.Gamma(sm.families.links.log())
1739
1740        with pytest.warns(ValueWarning, match="unknown kwargs"):
1741            GLM(self.endog, self.exog, family=family_link,
1742                weights=self.weight,  # incorrect keyword
1743                )
1744
1745
1746class TestWtdGlmGammaScale_X2(CheckWtdDuplicationMixin):
1747    @classmethod
1748    def setup_class(cls):
1749        '''
1750        Tests Gamma family with log link.
1751        '''
1752        super(TestWtdGlmGammaScale_X2, cls).setup_class()
1753        family_link = sm.families.Gamma(sm.families.links.log())
1754        cls.res1 = GLM(cls.endog, cls.exog,
1755                       freq_weights=cls.weight,
1756                       family=family_link,
1757                       ).fit(scale='X2')
1758        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1759                       family=family_link,
1760                       ).fit(scale='X2')
1761
1762
1763class TestWtdGlmGammaScale_dev(CheckWtdDuplicationMixin):
1764    @classmethod
1765    def setup_class(cls):
1766        '''
1767        Tests Gamma family with log link.
1768        '''
1769        super(TestWtdGlmGammaScale_dev, cls).setup_class()
1770        family_link = sm.families.Gamma(sm.families.links.log())
1771        cls.res1 = GLM(cls.endog, cls.exog,
1772                       freq_weights=cls.weight,
1773                       family=family_link,
1774                       ).fit(scale='dev')
1775        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1776                       family=family_link,
1777                       ).fit(scale='dev')
1778
1779    def test_missing(self):
1780        endog = self.data.endog.copy()
1781        exog = self.data.exog.copy()
1782        exog[0, 0] = np.nan
1783        endog[[2, 4, 6, 8]] = np.nan
1784        freq_weights = self.weight
        mod_missing = GLM(endog, exog, family=self.res1.model.family,
                          freq_weights=freq_weights, missing='drop')
        assert_equal(mod_missing.freq_weights.shape[0],
                     mod_missing.endog.shape[0])
        assert_equal(mod_missing.freq_weights.shape[0],
                     mod_missing.exog.shape[0])
        keep_idx = np.array([1, 3, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16])
        assert_equal(mod_missing.freq_weights, self.weight[keep_idx])
1793
1794
1795class TestWtdTweedieLog(CheckWtdDuplicationMixin):
1796    @classmethod
1797    def setup_class(cls):
1798        '''
1799        Tests Tweedie family with log link and var_power=1.
1800        '''
1801        super(TestWtdTweedieLog, cls).setup_class()
1802        family_link = sm.families.Tweedie(link=sm.families.links.log(),
1803                                          var_power=1)
1804        cls.res1 = GLM(cls.endog, cls.exog,
1805                        freq_weights=cls.weight,
1806                        family=family_link).fit()
1807        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1808                        family=family_link).fit()
1809
1810
1811class TestWtdTweediePower2(CheckWtdDuplicationMixin):
1812    @classmethod
1813    def setup_class(cls):
1814        '''
1815        Tests Tweedie family with Power(1) link and var_power=2.
1816        '''
1817        cls.data = cpunish.load_pandas()
1818        cls.endog = cls.data.endog
1819        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
1820        np.random.seed(1234)
1821        cls.weight = np.random.randint(5, 100, len(cls.endog))
1822        cls.endog_big = np.repeat(cls.endog.values, cls.weight)
1823        cls.exog_big = np.repeat(cls.exog.values, cls.weight, axis=0)
1824        link = sm.families.links.Power()
1825        family_link = sm.families.Tweedie(link=link, var_power=2)
1826        cls.res1 = GLM(cls.endog, cls.exog,
1827                       freq_weights=cls.weight,
1828                       family=family_link).fit()
1829        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1830                       family=family_link).fit()
1831
1832
1833class TestWtdTweediePower15(CheckWtdDuplicationMixin):
1834    @classmethod
1835    def setup_class(cls):
1836        '''
1837        Tests Tweedie family with Power(0.5) link and var_power=1.5.
1838        '''
1839        super(TestWtdTweediePower15, cls).setup_class()
1840        family_link = sm.families.Tweedie(link=sm.families.links.Power(0.5),
1841                                          var_power=1.5)
1842        cls.res1 = GLM(cls.endog, cls.exog,
1843                        freq_weights=cls.weight,
1844                        family=family_link).fit()
1845        cls.res2 = GLM(cls.endog_big, cls.exog_big,
1846                        family=family_link).fit()
1847
1848
1849def test_wtd_patsy_missing():
1851    data = cpunish.load()
1852    data.endog = np.asarray(data.endog)
1853    data.exog = np.asarray(data.exog)
1854    data.exog[0, 0] = np.nan
1855    data.endog[[2, 4, 6, 8]] = np.nan
1856    data.pandas = pd.DataFrame(data.exog, columns=data.exog_name)
1857    data.pandas['EXECUTIONS'] = data.endog
1858    weights = np.arange(1, len(data.endog)+1)
1859    formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 +
1860                 SOUTH + DEGREE"""
    mod_missing = GLM.from_formula(formula, data=data.pandas,
                                   freq_weights=weights)
    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.endog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0],
                 mod_missing.exog.shape[0])
    assert_equal(mod_missing.freq_weights.shape[0], 12)
    keep_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17])
    assert_equal(mod_missing.freq_weights, keep_weights)
1870
1871
1872class CheckTweedie(object):
1873    def test_resid(self):
1874        idx1 = len(self.res1.resid_response) - 1
1875        idx2 = len(self.res2.resid_response) - 1
1876        assert_allclose(np.concatenate((self.res1.resid_response[:17],
1877                                        [self.res1.resid_response[idx1]])),
1878                        np.concatenate((self.res2.resid_response[:17],
1879                                        [self.res2.resid_response[idx2]])),
1880                        rtol=1e-5, atol=1e-5)
1881        assert_allclose(np.concatenate((self.res1.resid_pearson[:17],
1882                                        [self.res1.resid_pearson[idx1]])),
1883                        np.concatenate((self.res2.resid_pearson[:17],
1884                                        [self.res2.resid_pearson[idx2]])),
1885                        rtol=1e-5, atol=1e-5)
1886        assert_allclose(np.concatenate((self.res1.resid_deviance[:17],
1887                                        [self.res1.resid_deviance[idx1]])),
1888                        np.concatenate((self.res2.resid_deviance[:17],
1889                                        [self.res2.resid_deviance[idx2]])),
1890                        rtol=1e-5, atol=1e-5)
1891
1892        assert_allclose(np.concatenate((self.res1.resid_working[:17],
1893                                        [self.res1.resid_working[idx1]])),
1894                        np.concatenate((self.res2.resid_working[:17],
1895                                        [self.res2.resid_working[idx2]])),
                        rtol=1e-5, atol=1e-5)

    def test_bse(self):
        assert_allclose(self.res1.bse, self.res2.bse, atol=1e-6, rtol=1e-6)
1901
1902    def test_params(self):
1903        assert_allclose(self.res1.params, self.res2.params, atol=1e-5,
1904                        rtol=1e-5)
1905
1906    def test_deviance(self):
1907        assert_allclose(self.res1.deviance, self.res2.deviance, atol=1e-6,
1908                        rtol=1e-6)
1909
1910    def test_df(self):
1911        assert_equal(self.res1.df_model, self.res2.df_model)
1912        assert_equal(self.res1.df_resid, self.res2.df_resid)
1913
1914    def test_fittedvalues(self):
1915        idx1 = len(self.res1.fittedvalues) - 1
        idx2 = len(self.res2.fittedvalues) - 1
1917        assert_allclose(np.concatenate((self.res1.fittedvalues[:17],
1918                                        [self.res1.fittedvalues[idx1]])),
1919                        np.concatenate((self.res2.fittedvalues[:17],
1920                                        [self.res2.fittedvalues[idx2]])),
1921                        atol=1e-4, rtol=1e-4)
1922
1923    def test_summary(self):
1924        self.res1.summary()
1925        self.res1.summary2()
1926
1927
1928class TestTweediePower15(CheckTweedie):
1929    @classmethod
1930    def setup_class(cls):
1931        from .results.results_glm import CpunishTweediePower15
1932        cls.data = cpunish.load_pandas()
1933        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
1934        cls.endog = cls.data.endog
1935        family_link = sm.families.Tweedie(link=sm.families.links.Power(1),
1936                                          var_power=1.5)
1937        cls.res1 = sm.GLM(endog=cls.data.endog,
1938                          exog=cls.data.exog[['INCOME', 'SOUTH']],
1939                          family=family_link).fit()
1940        cls.res2 = CpunishTweediePower15()
1941
1942
1943class TestTweediePower2(CheckTweedie):
1944    @classmethod
1945    def setup_class(cls):
1946        from .results.results_glm import CpunishTweediePower2
1947        cls.data = cpunish.load_pandas()
1948        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
1949        cls.endog = cls.data.endog
1950        family_link = sm.families.Tweedie(link=sm.families.links.Power(1),
1951                                          var_power=2.)
1952        cls.res1 = sm.GLM(endog=cls.data.endog,
1953                          exog=cls.data.exog[['INCOME', 'SOUTH']],
1954                          family=family_link).fit()
1955        cls.res2 = CpunishTweediePower2()
1956
1957
1958class TestTweedieLog1(CheckTweedie):
1959    @classmethod
1960    def setup_class(cls):
1961        from .results.results_glm import CpunishTweedieLog1
1962        cls.data = cpunish.load_pandas()
1963        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
1964        cls.endog = cls.data.endog
1965        family_link = sm.families.Tweedie(link=sm.families.links.log(),
1966                                          var_power=1.)
1967        cls.res1 = sm.GLM(endog=cls.data.endog,
1968                          exog=cls.data.exog[['INCOME', 'SOUTH']],
1969                          family=family_link).fit()
1970        cls.res2 = CpunishTweedieLog1()
1971
1972
1973class TestTweedieLog15Fair(CheckTweedie):
1974    @classmethod
1975    def setup_class(cls):
1976        from statsmodels.datasets.fair import load_pandas
1977
1978        from .results.results_glm import FairTweedieLog15
1979        data = load_pandas()
1980        family_link = sm.families.Tweedie(link=sm.families.links.log(),
1981                                          var_power=1.5)
1982        cls.res1 = sm.GLM(endog=data.endog,
1983                          exog=data.exog[['rate_marriage', 'age',
1984                                          'yrs_married']],
1985                          family=family_link).fit()
1986        cls.res2 = FairTweedieLog15()
1987
1988
1989class CheckTweedieSpecial(object):
1990    def test_mu(self):
1991        assert_allclose(self.res1.mu, self.res2.mu, rtol=1e-5, atol=1e-5)
1992
1993    def test_resid(self):
1994        assert_allclose(self.res1.resid_response, self.res2.resid_response,
1995                        rtol=1e-5, atol=1e-5)
1996        assert_allclose(self.res1.resid_pearson, self.res2.resid_pearson,
1997                        rtol=1e-5, atol=1e-5)
1998        assert_allclose(self.res1.resid_deviance, self.res2.resid_deviance,
1999                        rtol=1e-5, atol=1e-5)
2000        assert_allclose(self.res1.resid_working, self.res2.resid_working,
2001                        rtol=1e-5, atol=1e-5)
2002        assert_allclose(self.res1.resid_anscombe_unscaled,
2003                        self.res2.resid_anscombe_unscaled,
2004                        rtol=1e-5, atol=1e-5)
2005
2006
2007class TestTweedieSpecialLog0(CheckTweedieSpecial):
2008    @classmethod
2009    def setup_class(cls):
2010        cls.data = cpunish.load_pandas()
2011        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
2012        cls.endog = cls.data.endog
2013        family1 = sm.families.Gaussian(link=sm.families.links.log())
2014        cls.res1 = sm.GLM(endog=cls.data.endog,
2015                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2016                          family=family1).fit()
2017        family2 = sm.families.Tweedie(link=sm.families.links.log(),
2018                                      var_power=0)
2019        cls.res2 = sm.GLM(endog=cls.data.endog,
2020                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2021                          family=family2).fit()
2022
2023
2024class TestTweedieSpecialLog1(CheckTweedieSpecial):
2025    @classmethod
2026    def setup_class(cls):
2027        cls.data = cpunish.load_pandas()
2028        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
2029        cls.endog = cls.data.endog
2030        family1 = sm.families.Poisson(link=sm.families.links.log())
2031        cls.res1 = sm.GLM(endog=cls.data.endog,
2032                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2033                          family=family1).fit()
2034        family2 = sm.families.Tweedie(link=sm.families.links.log(),
2035                                      var_power=1)
2036        cls.res2 = sm.GLM(endog=cls.data.endog,
2037                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2038                          family=family2).fit()
2039
2040
2041class TestTweedieSpecialLog2(CheckTweedieSpecial):
2042    @classmethod
2043    def setup_class(cls):
2044        cls.data = cpunish.load_pandas()
2045        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
2046        cls.endog = cls.data.endog
2047        family1 = sm.families.Gamma(link=sm.families.links.log())
2048        cls.res1 = sm.GLM(endog=cls.data.endog,
2049                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2050                          family=family1).fit()
2051        family2 = sm.families.Tweedie(link=sm.families.links.log(),
2052                                      var_power=2)
2053        cls.res2 = sm.GLM(endog=cls.data.endog,
2054                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2055                          family=family2).fit()
2056
2057
2058class TestTweedieSpecialLog3(CheckTweedieSpecial):
2059    @classmethod
2060    def setup_class(cls):
2061        cls.data = cpunish.load_pandas()
2062        cls.exog = cls.data.exog[['INCOME', 'SOUTH']]
2063        cls.endog = cls.data.endog
2064        family1 = sm.families.InverseGaussian(link=sm.families.links.log())
2065        cls.res1 = sm.GLM(endog=cls.data.endog,
2066                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2067                          family=family1).fit()
2068        family2 = sm.families.Tweedie(link=sm.families.links.log(),
2069                                      var_power=3)
2070        cls.res2 = sm.GLM(endog=cls.data.endog,
2071                          exog=cls.data.exog[['INCOME', 'SOUTH']],
2072                          family=family2).fit()
2073

def gen_tweedie(p):
    np.random.seed(3242)
2077    n = 500
2078    x = np.random.normal(size=(n, 4))
2079    lpr = np.dot(x, np.r_[1, -1, 0, 0.5])
2080    mu = np.exp(lpr)
2081    lam = 10 * mu**(2 - p) / (2 - p)
2082    alp = (2 - p) / (p - 1)
2083    bet = 10 * mu**(1 - p) / (p - 1)
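    # Compound Poisson-gamma representation of the Tweedie distribution:
    # draw N ~ Poisson(lam) and sum N Gamma(alp, scale=1/bet) variates,
    # which gives E[y] = mu and Var(y) = phi * mu**p with dispersion
    # phi = 0.1 (the factor of 10 above is 1 / phi).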
2084
    # Generate Tweedie values using the compound Poisson distribution
2086    y = np.empty(n)
2087    N = np.random.poisson(lam)
2088    for i in range(n):
2089        y[i] = np.random.gamma(alp, 1 / bet[i], N[i]).sum()
2090
2091    return y, x
2092

@pytest.mark.filterwarnings("ignore:GLM ridge optimization")
2094def test_tweedie_EQL():
2095    # All tests below are regression tests, but the results
2096    # are very close to the population values.
2097
2098    p = 1.5
2099    y, x = gen_tweedie(p)
2100
2101    # Un-regularized fit using gradients
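    # eql=True replaces the exact Tweedie density with the extended
    # quasi-likelihood (Nelder-Pregibon) approximation, which makes llf,
    # score and hessian available for the gradient-based fits below.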
2102    fam = sm.families.Tweedie(var_power=p, eql=True)
2103    model1 = sm.GLM(y, x, family=fam)
2104    result1 = model1.fit(method="newton")
2105    assert_allclose(result1.params,
2106       np.array([1.00350497, -0.99656954, 0.00802702, 0.50713209]),
2107       rtol=1e-5, atol=1e-5)
2108
2109    # Un-regularized fit using IRLS
2110    model1x = sm.GLM(y, x, family=fam)
2111    result1x = model1x.fit(method="irls")
2112    assert_allclose(result1.params, result1x.params)
2113    assert_allclose(result1.bse, result1x.bse, rtol=1e-2)
2114
2115    # Lasso fit using coordinate-wise descent
2116    # TODO: The search gets trapped in an infinite oscillation, so use
2117    # a slack convergence tolerance.
2118    model2 = sm.GLM(y, x, family=fam)
2119    result2 = model2.fit_regularized(L1_wt=1, alpha=0.07, maxiter=200,
2120                   cnvrg_tol=0.01)
2121
2122    rtol, atol = 1e-2, 1e-4
2123    assert_allclose(result2.params,
2124        np.array([0.976831, -0.952854, 0., 0.470171]),
2125        rtol=rtol, atol=atol)
2126
2127    # Series of ridge fits using gradients
2128    ev = (np.array([1.001778, -0.99388, 0.00797, 0.506183]),
2129          np.array([0.98586638, -0.96953481, 0.00749983, 0.4975267]),
2130          np.array([0.206429, -0.164547, 0.000235, 0.102489]))
2131    for j, alpha in enumerate([0.05, 0.5, 0.7]):
2132        model3 = sm.GLM(y, x, family=fam)
2133        result3 = model3.fit_regularized(L1_wt=0, alpha=alpha)
2134        assert_allclose(result3.params, ev[j], rtol=rtol, atol=atol)
2135        result4 = model3.fit_regularized(L1_wt=0, alpha=alpha * np.ones(x.shape[1]))
2136        assert_allclose(result4.params, result3.params, rtol=rtol, atol=atol)
2137        alpha = alpha * np.ones(x.shape[1])
2138        alpha[0] = 0
2139        result5 = model3.fit_regularized(L1_wt=0, alpha=alpha)
2140        assert not np.allclose(result5.params, result4.params)
2141

def test_tweedie_elastic_net():
2143    # Check that the coefficients vanish one-by-one
2144    # when using the elastic net.
2145
    p = 1.5  # Tweedie variance exponent
2147    y, x = gen_tweedie(p)
2148
2149    # Un-regularized fit using gradients
2150    fam = sm.families.Tweedie(var_power=p, eql=True)
2151    model1 = sm.GLM(y, x, family=fam)
2152
2153    nnz = []
2154    for alpha in np.linspace(0, 10, 20):
2155        result1 = model1.fit_regularized(L1_wt=0.5, alpha=alpha)
2156        nnz.append((np.abs(result1.params) > 0).sum())
2157    nnz = np.unique(nnz)
2158    assert len(nnz) == 5
2159
2160
2161def test_tweedie_EQL_poisson_limit():
2162    # Test the limiting Poisson case of the Nelder/Pregibon/Tweedie
2163    # EQL.
2164
2165    np.random.seed(3242)
2166    n = 500
2167
2168    x = np.random.normal(size=(n, 3))
2169    x[:, 0] = 1
2170    lpr = 4 + x[:, 1:].sum(1)
2171    mn = np.exp(lpr)
2172    y = np.random.poisson(mn)
2173
2174    for scale in 1.0, 'x2', 'dev':
2175
2176        # Un-regularized fit using gradients not IRLS
2177        fam = sm.families.Tweedie(var_power=1, eql=True)
2178        model1 = sm.GLM(y, x, family=fam)
2179        result1 = model1.fit(method="newton", scale=scale)
2180
2181        # Poisson GLM
2182        model2 = sm.GLM(y, x, family=sm.families.Poisson())
2183        result2 = model2.fit(method="newton", scale=scale)
2184
2185        assert_allclose(result1.params, result2.params, atol=1e-6, rtol=1e-6)
        assert_allclose(result1.bse, result2.bse, rtol=1e-6, atol=1e-6)
2187
2188
2189def test_tweedie_EQL_upper_limit():
2190    # Test the limiting case of the Nelder/Pregibon/Tweedie
2191    # EQL with var = mean^2.  These are tests against population
2192    # values so accuracy is not high.
2193
2194    np.random.seed(3242)
2195    n = 500
2196
2197    x = np.random.normal(size=(n, 3))
2198    x[:, 0] = 1
2199    lpr = 4 + x[:, 1:].sum(1)
2200    mn = np.exp(lpr)
2201    y = np.random.poisson(mn)
2202
2203    for scale in 'x2', 'dev', 1.0:
2204
2205        # Un-regularized fit using gradients not IRLS
2206        fam = sm.families.Tweedie(var_power=2, eql=True)
2207        model1 = sm.GLM(y, x, family=fam)
2208        result1 = model1.fit(method="newton", scale=scale)
2209        assert_allclose(result1.params, np.r_[4, 1, 1], atol=1e-3, rtol=1e-1)
2210
2211
def test_tweedie_power_estimate():
    # Test the Pearson estimate of the Tweedie variance power and scale
    # parameters.
    #
    # Ideally, this would match the following R code, but the R fit could
    # not be reproduced:
2216    #
2217    # setwd('c:/workspace')
2218    # data <- read.csv('cpunish.csv', sep=",")
2219    #
2220    # library(tweedie)
2221    #
2222    # y <- c(1.00113835e+05,   6.89668315e+03,   6.15726842e+03,
2223    #        1.41718806e+03,   5.11776456e+02,   2.55369154e+02,
2224    #        1.07147443e+01,   3.56874698e+00,   4.06797842e-02,
2225    #        7.06996731e-05,   2.10165106e-07,   4.34276938e-08,
2226    #        1.56354040e-09,   0.00000000e+00,   0.00000000e+00,
2227    #        0.00000000e+00,   0.00000000e+00)
2228    #
2229    # data$NewY <- y
2230    #
2231    # out <- tweedie.profile( NewY ~ INCOME + SOUTH - 1,
2232    #                         p.vec=c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8,
2233    #                                 1.9), link.power=0,
2234    #                         data=data,do.plot = TRUE)
2235    data = cpunish.load_pandas()
2236    y = [1.00113835e+05,   6.89668315e+03,   6.15726842e+03,
2237         1.41718806e+03,   5.11776456e+02,   2.55369154e+02,
2238         1.07147443e+01,   3.56874698e+00,   4.06797842e-02,
2239         7.06996731e-05,   2.10165106e-07,   4.34276938e-08,
2240         1.56354040e-09,   0.00000000e+00,   0.00000000e+00,
2241         0.00000000e+00,   0.00000000e+00]
2242    model1 = sm.GLM(y, data.exog[['INCOME', 'SOUTH']],
2243                    family=sm.families.Tweedie(link=sm.families.links.log(),
2244                                               var_power=1.5))
2245    res1 = model1.fit()
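    # Var(y) = phi * mu**p implies log E[(y - mu)**2] ~ log(phi) + p*log(mu),
    # so a Gamma/log GLM of the squared residuals on log(mu) has a slope
    # that estimates the variance power p.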
2246    model2 = sm.GLM((y - res1.mu) ** 2,
2247                    np.column_stack((np.ones(len(res1.mu)), np.log(res1.mu))),
2248                    family=sm.families.Gamma(sm.families.links.log()))
2249    res2 = model2.fit()
2250    # Sample may be too small for this...
2251    # assert_allclose(res1.scale, np.exp(res2.params[0]), rtol=0.25)
2252    p = model1.estimate_tweedie_power(res1.mu)
2253    assert_allclose(p, res2.params[1], rtol=0.25)
2254

def test_glm_lasso_6431():
    # Based on issue #6431
2258    # Fails with newton-cg as optimizer
2259    np.random.seed(123)
2260
2261    from statsmodels.regression.linear_model import OLS
2262
2263    n = 50
2264    x = np.ones((n, 2))
2265    x[:, 1] = np.arange(0, n)
2266    y = 1000 + x[:, 1] + np.random.normal(0, 1, n)
2267
2268    params = np.r_[999.82244338, 1.0077889]
2269
2270    for method in "bfgs", None:
2271        for fun in [OLS, GLM]:
2272
2273            # Changing L1_wtValue from 0 to 1e-9 changes
2274            # the algorithm from scipy gradient optimization
2275            # to statsmodels coordinate descent
2276            for L1_wtValue in [0, 1e-9]:
2277                model = fun(y, x)
2278                if fun == OLS:
2279                    fit = model.fit_regularized(alpha=0, L1_wt=L1_wtValue)
2280                else:
2281                    fit = model._fit_ridge(alpha=0, start_params=None, method=method)
2282                assert_allclose(params, fit.params, atol=1e-6, rtol=1e-6)
2283

class TestRegularized(object):
2285
2286    def test_regularized(self):
2287
2290        from .results import glmnet_r_results
2291
2292        for dtype in "binomial", "poisson":
2293
2294            cur_dir = os.path.dirname(os.path.abspath(__file__))
            data = np.loadtxt(os.path.join(cur_dir, "results",
                                           "enet_%s.csv" % dtype),
                              delimiter=",")
2297
2298            endog = data[:, 0]
2299            exog = data[:, 1:]
2300
2301            fam = {"binomial" : sm.families.Binomial,
2302                   "poisson" : sm.families.Poisson}[dtype]
2303
2304            for j in range(9):
2305
2306                vn = "rslt_%s_%d" % (dtype, j)
2307                r_result = getattr(glmnet_r_results, vn)
2308                L1_wt = r_result[0]
2309                alpha = r_result[1]
2310                params = r_result[2:]
2311
2312                model = GLM(endog, exog, family=fam())
2313                sm_result = model.fit_regularized(L1_wt=L1_wt, alpha=alpha)
2314
2315                # Agreement is OK, see below for further check
2316                assert_allclose(params, sm_result.params, atol=1e-2, rtol=0.3)
2317
2318                # The penalized log-likelihood that we are maximizing.
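                # plf(b) = llf(b) / n - alpha * ((1 - L1_wt) * ||b||_2**2 / 2
                #                                + L1_wt * ||b||_1),
                # the elastic net objective used by fit_regularized.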
2319                def plf(params):
2320                    llf = model.loglike(params) / len(endog)
                    llf = llf - alpha * ((1 - L1_wt) * np.sum(params**2) / 2
                                         + L1_wt * np.sum(np.abs(params)))
2322                    return llf
2323
2324                # Confirm that we are doing better than glmnet.
2325                llf_r = plf(params)
2326                llf_sm = plf(sm_result.params)
2327                assert_equal(np.sign(llf_sm - llf_r), 1)
2328
2329
2330class TestConvergence(object):
2331    @classmethod
2332    def setup_class(cls):
2333        '''
2334        Test Binomial family with canonical logit link using star98 dataset.
2335        '''
2336        from statsmodels.datasets.star98 import load
2337        data = load()
2338        data.exog = add_constant(data.exog, prepend=False)
2339        cls.model = GLM(data.endog, data.exog,
2340                         family=sm.families.Binomial())
2341
2342    def _when_converged(self, atol=1e-8, rtol=0, tol_criterion='deviance'):
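        # Scan the fit history and return the index of the first iteration
        # at which the criterion stops changing within the given tolerances.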
        history = self.res.fit_history[tol_criterion]
        for i in range(len(history) - 1):
            if np.allclose(history[i], history[i + 1], atol=atol, rtol=rtol):
                return i
        raise ValueError("CONVERGENCE CHECK: It seems this doesn't converge!")
2349
2350    def test_convergence_atol_only(self):
2351        atol = 1e-8
2352        rtol = 0
2353        self.res = self.model.fit(atol=atol, rtol=rtol)
2354        expected_iterations = self._when_converged(atol=atol, rtol=rtol)
2355        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2360        assert_equal(expected_iterations, actual_iterations)
2361        assert_equal(len(self.res.fit_history['deviance']) - 2,
2362                     actual_iterations)
2363
2364    def test_convergence_rtol_only(self):
2365        atol = 0
2366        rtol = 1e-8
2367        self.res = self.model.fit(atol=atol, rtol=rtol)
2368        expected_iterations = self._when_converged(atol=atol, rtol=rtol)
2369        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2374        assert_equal(expected_iterations, actual_iterations)
2375        assert_equal(len(self.res.fit_history['deviance']) - 2,
2376                     actual_iterations)
2377
2378    def test_convergence_atol_rtol(self):
2379        atol = 1e-8
2380        rtol = 1e-8
2381        self.res = self.model.fit(atol=atol, rtol=rtol)
2382        expected_iterations = self._when_converged(atol=atol, rtol=rtol)
2383        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2388        assert_equal(expected_iterations, actual_iterations)
2389        assert_equal(len(self.res.fit_history['deviance']) - 2,
2390                     actual_iterations)
2391
2392    def test_convergence_atol_only_params(self):
2393        atol = 1e-8
2394        rtol = 0
2395        self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params')
2396        expected_iterations = self._when_converged(atol=atol, rtol=rtol,
2397                                                   tol_criterion='params')
2398        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2403        assert_equal(expected_iterations, actual_iterations)
2404        assert_equal(len(self.res.fit_history['deviance']) - 2,
2405                     actual_iterations)
2406
2407    def test_convergence_rtol_only_params(self):
2408        atol = 0
2409        rtol = 1e-8
2410        self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params')
2411        expected_iterations = self._when_converged(atol=atol, rtol=rtol,
2412                                                   tol_criterion='params')
2413        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2418        assert_equal(expected_iterations, actual_iterations)
2419        assert_equal(len(self.res.fit_history['deviance']) - 2,
2420                     actual_iterations)
2421
2422    def test_convergence_atol_rtol_params(self):
2423        atol = 1e-8
2424        rtol = 1e-8
2425        self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params')
2426        expected_iterations = self._when_converged(atol=atol, rtol=rtol,
2427                                                   tol_criterion='params')
2428        actual_iterations = self.res.fit_history['iteration']
        # Note the first value in the list is np.inf. The second value
        # is the initial guess based off of start_params or the
        # estimate thereof. The third value (index = 2) is the actual
        # "first iteration".
2433        assert_equal(expected_iterations, actual_iterations)
2434        assert_equal(len(self.res.fit_history['deviance']) - 2,
2435                     actual_iterations)
2436
2437
2438def test_poisson_deviance():
2439    # see #3355 missing term in deviance if resid_response.sum() != 0
2440    np.random.seed(123987)
    nobs, k_vars = 50, 2  # two slopes; add_constant below gives 3 columns
2442    x = sm.add_constant(np.random.randn(nobs, k_vars))
2443
2444    mu_true = np.exp(x.sum(1))
2445    y = np.random.poisson(mu_true, size=nobs)
2446
    mod = sm.GLM(y, x, family=sm.genmod.families.Poisson())
2448    res = mod.fit()
2449
2450    d_i = res.resid_deviance
2451    d = res.deviance
2452    lr = (mod.family.loglike(y, y+1e-20) -
2453          mod.family.loglike(y, res.fittedvalues)) * 2
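    # The saturated model sets mu = y (y + 1e-20 guards against log(0)),
    # so lr is the likelihood-ratio form of the deviance.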
2454
2455    assert_allclose(d, (d_i**2).sum(), rtol=1e-12)
2456    assert_allclose(d, lr, rtol=1e-12)
2457
2458    # case without constant, resid_response.sum() != 0
2459    mod_nc = sm.GLM(y, x[:, 1:], family=sm.genmod.families.Poisson())
2460    res_nc = mod_nc.fit()
2461
2462    d_i = res_nc.resid_deviance
2463    d = res_nc.deviance
    lr = (mod_nc.family.loglike(y, y+1e-20) -
          mod_nc.family.loglike(y, res_nc.fittedvalues)) * 2
2466
2467    assert_allclose(d, (d_i**2).sum(), rtol=1e-12)
2468    assert_allclose(d, lr, rtol=1e-12)
2469
2470
2471def test_non_invertible_hessian_fails_summary():
2472    # Test when the hessian fails the summary is still available.
2473    data = cpunish.load_pandas()
2474
2475    data.endog[:] = 1
2476    with warnings.catch_warnings():
2477        # we filter DomainWarning, the convergence problems
2478        # and warnings in summary
2479        warnings.simplefilter("ignore")
2480        mod = sm.GLM(data.endog, data.exog, family=sm.families.Gamma())
2481        res = mod.fit(maxiter=1, method='bfgs', max_start_irls=0)
2482        res.summary()
2483
2484
2485def test_int_scale():
2486    # GH-6627, make sure it works with int scale
2487    data = longley.load()
2488    mod = GLM(data.endog, data.exog, family=sm.families.Gaussian())
2489    res = mod.fit(scale=1)
2490    assert isinstance(res.params, pd.Series)
2491    assert res.scale.dtype == np.float64
2492
2493
2494@pytest.mark.parametrize("dtype", [np.int8, np.int16, np.int32, np.int64])
2495def test_int_exog(dtype):
2496    # GH-6627, make use of floats internally
2497    count1, n1, count2, n2 = 60, 51477.5, 30, 54308.7
2498    y = [count1, count2]
2499    x = np.asarray([[1, 1], [1, 0]]).astype(dtype)
2500    exposure = np.asarray([n1, n2])
2501    mod = GLM(y, x, exposure=exposure, family=sm.families.Poisson())
2502    res = mod.fit(method='bfgs', max_start_irls=0)
2503    assert isinstance(res.params, np.ndarray)
2504
2505
2506def test_glm_bic(iris):
2507    X = np.c_[np.ones(100), iris[50:, :4]]
    y = iris[50:, 4].astype(np.int32)
2509    y -= 1
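    # With SET_USE_BIC_LLF(True), bic is the log-likelihood based version
    # (roughly -2*llf + log(nobs) * k); with False it falls back to the
    # deviance-based bic_deviance.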
2510    SET_USE_BIC_LLF(True)
2511    model = GLM(y, X, family=sm.families.Binomial()).fit()
2512    # 34.9244 is what glm() of R yields
2513    assert_almost_equal(model.bic, 34.9244, decimal=3)
2514    assert_almost_equal(model.bic_llf, 34.9244, decimal=3)
2515    SET_USE_BIC_LLF(False)
2516    assert_almost_equal(model.bic, model.bic_deviance, decimal=3)
2517    SET_USE_BIC_LLF(None)
2518
2519
2520def test_glm_bic_warning(iris):
2521    X = np.c_[np.ones(100), iris[50:, :4]]
    y = iris[50:, 4].astype(np.int32)
2523    y -= 1
2524    model = GLM(y, X, family=sm.families.Binomial()).fit()
2525    with pytest.warns(FutureWarning, match="The bic"):
2526        assert isinstance(model.bic, float)
2527
2528
2529def test_output_exposure_null(reset_randomstate):
2530    # GH 6953
2531
2532    x0 = [np.sin(i / 20) + 2 for i in range(1000)]
2533    rs = np.random.RandomState(0)
2534    # Variable exposures for each observation
2535    exposure = rs.randint(100, 200, size=1000)
2536    y = [np.sum(rs.poisson(x, size=e)) for x, e in zip(x0, exposure)]
2537    x = add_constant(x0)
2538
2539    model = GLM(
2540        endog=y, exog=x, exposure=exposure, family=sm.families.Poisson()
2541    ).fit()
2542    null_model = GLM(
2543        endog=y, exog=x[:, 0], exposure=exposure, family=sm.families.Poisson()
2544    ).fit()
2545    null_model_without_exposure = GLM(
2546        endog=y, exog=x[:, 0], family=sm.families.Poisson()
2547    ).fit()
2548    assert_allclose(model.llnull, null_model.llf)
2549    # Check that they are different
2550    assert np.abs(null_model_without_exposure.llf - model.llnull) > 1
2551
2552
def test_qaic():
    # Example from documentation of R package MuMIn
2556    import patsy
2557    ldose = np.concatenate((np.arange(6), np.arange(6)))
2558    sex = ["M"]*6 + ["F"]*6
2559    numdead = [10, 4, 9, 12, 18, 20, 0, 2, 6, 10, 12, 16]
2560    df = pd.DataFrame({"ldose": ldose, "sex": sex, "numdead": numdead})
2561    df["numalive"] = 20 - df["numdead"]
2562    df["SF"] = df["numdead"]
2563
2564    y = df[["numalive", "numdead"]].values
2565    x = patsy.dmatrix("sex*ldose", data=df, return_type='dataframe')
2566    m = GLM(y, x, family=sm.families.Binomial())
2567    r = m.fit()
2568    scale = 2.412699
2569    qaic = r.info_criteria(crit="qaic", scale=scale)
2570
2571    # R gives 31.13266 because it uses a df that is 1 greater,
2572    # presumably because they count the scale parameter in df.
2573    # This won't matter when comparing models by differencing
2574    # QAICs.
2575    # Binomial doesn't have a scale parameter, so adding +1 is not correct.
2576    assert_allclose(qaic, 29.13266, rtol=1e-5, atol=1e-5)
2577    qaic1 = r.info_criteria(crit="qaic", scale=scale, dk_params=1)
2578    assert_allclose(qaic1, 31.13266, rtol=1e-5, atol=1e-5)
2579
2580
2581def test_tweedie_score():
2582
2583    np.random.seed(3242)
2584    n = 500
2585    x = np.random.normal(size=(n, 4))
2586    lpr = np.dot(x, np.r_[1, -1, 0, 0.5])
2587    mu = np.exp(lpr)
2588
2589    p0 = 1.5
2590    lam = 10 * mu**(2 - p0) / (2 - p0)
2591    alp = (2 - p0) / (p0 - 1)
2592    bet = 10 * mu**(1 - p0) / (p0 - 1)
2593    y = np.empty(n)
2594    N = np.random.poisson(lam)
2595    for i in range(n):
2596        y[i] = np.random.gamma(alp, 1 / bet[i], N[i]).sum()
2597
2598    for p in [1, 1.5, 2]:
2599
2600        fam = sm.families.Tweedie(var_power=p, eql=True)
2601        model = GLM(y, x, family=fam)
2602        result = model.fit()
2603
2604        pa = result.params + 0.2*np.random.normal(size=result.params.size)
2605
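        # approx_fprime_cs / approx_hess_cs use complex-step differentiation,
        # which is free of subtractive cancellation, so tight tolerances
        # against the analytic score and hessian are appropriate.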
2606        ngrad = approx_fprime_cs(pa, lambda x: model.loglike(x, scale=1))
2607        agrad = model.score(pa, scale=1)
2608        assert_allclose(ngrad, agrad, atol=1e-8, rtol=1e-8)
2609
2610        nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1))
2611        ahess = model.hessian(pa, scale=1)
2612        assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8)
2613