1# -*- coding: utf-8 -*-
2"""
3Created on Fri May 30 16:22:29 2014
4
5Author: Josef Perktold
6License: BSD-3
7
8"""
9from io import StringIO
10
11import numpy as np
12from numpy.testing import assert_, assert_allclose, assert_equal
13import pandas as pd
14import patsy
15import pytest
16
17from statsmodels import datasets
18from statsmodels.base._constraints import fit_constrained
19from statsmodels.discrete.discrete_model import Poisson
20from statsmodels.genmod import families
21from statsmodels.genmod.generalized_linear_model import GLM
22from statsmodels.tools.tools import add_constant
23
24from .results import (
25    results_glm_logit_constrained as reslogit,
26    results_poisson_constrained as results,
27)
28
29spector_data = datasets.spector.load()
30spector_data.endog = np.asarray(spector_data.endog)
31spector_data.exog = np.asarray(spector_data.exog)
32spector_data.exog = add_constant(spector_data.exog, prepend=False)
33
34
35DEBUG = False
36
37ss = '''\
38agecat	smokes	deaths	pyears
391	1	32	52407
402	1	104	43248
413	1	206	28612
424	1	186	12663
435	1	102	5317
441	0	2	18790
452	0	12	10673
463	0	28	5710
474	0	28	2585
485	0	31	1462'''
49
50data = pd.read_csv(StringIO(ss), delimiter='\t')
51data = data.astype(int)
52data['logpyears'] = np.log(data['pyears'])
53
54
55class CheckPoissonConstrainedMixin(object):
56
57    def test_basic(self):
58        res1 = self.res1
59        res2 = self.res2
60        assert_allclose(res1[0], res2.params[self.idx], rtol=1e-6)
61        # see below Stata has nan, we have zero
62        bse1 = np.sqrt(np.diag(res1[1]))
63        mask = (bse1 == 0) & np.isnan(res2.bse[self.idx])
64        assert_allclose(bse1[~mask], res2.bse[self.idx][~mask], rtol=1e-6)
65
66    def test_basic_method(self):
67        if hasattr(self, 'res1m'):
68            res1 = (self.res1m if not hasattr(self.res1m, '_results')
69                    else self.res1m._results)
70            res2 = self.res2
71            assert_allclose(res1.params, res2.params[self.idx], rtol=1e-6)
72
73            # when a parameter is fixed, the Stata has bse=nan, we have bse=0
74            mask = (res1.bse == 0) & np.isnan(res2.bse[self.idx])
75            assert_allclose(res1.bse[~mask], res2.bse[self.idx][~mask],
76                            rtol=1e-6)
77
78            tvalues = res2.params_table[self.idx, 2]
79            # when a parameter is fixed, the Stata has tvalue=nan,
80            #  we have tvalue=inf
81            mask = np.isinf(res1.tvalues) & np.isnan(tvalues)
82            assert_allclose(res1.tvalues[~mask], tvalues[~mask], rtol=1e-6)
83            pvalues = res2.params_table[self.idx, 3]
84            # note most pvalues are very small
85            # examples so far agree at 8 or more decimal, but rtol is stricter
86            # see above
87            mask = (res1.pvalues == 0) & np.isnan(pvalues)
88            assert_allclose(res1.pvalues[~mask], pvalues[~mask], rtol=5e-5)
89
90            ci_low = res2.params_table[self.idx, 4]
91            ci_upp = res2.params_table[self.idx, 5]
92            ci = np.column_stack((ci_low, ci_upp))
93            # note most pvalues are very small
94            # examples so far agree at 8 or more decimal, but rtol is stricter
95            # see above: nan versus value
96            assert_allclose(res1.conf_int()[~np.isnan(ci)], ci[~np.isnan(ci)],
97                            rtol=5e-5)
98
99            # other
100            assert_allclose(res1.llf, res2.ll, rtol=1e-6)
101            assert_equal(res1.df_model, res2.df_m)
102            # Stata does not have df_resid
103            df_r = res2.N - res2.df_m - 1
104            assert_equal(res1.df_resid, df_r)
105        else:
106            pytest.skip("not available yet")
107
108    def test_other(self):
109        # some results may not be valid or available for all models
110        if hasattr(self, 'res1m'):
111            res1 = self.res1m
112            res2 = self.res2
113
114            if hasattr(res2, 'll_0'):
115                assert_allclose(res1.llnull, res2.ll_0, rtol=1e-6)
116            else:
117                if DEBUG:
118                    import warnings
119                    message = ('test: ll_0 not available, llnull=%6.4F'
120                               % res1.llnull)
121                    warnings.warn(message)
122
123        else:
124            pytest.skip("not available yet")
125
126
127class TestPoissonConstrained1a(CheckPoissonConstrainedMixin):
128
129    @classmethod
130    def setup_class(cls):
131
132        cls.res2 = results.results_noexposure_constraint
133        # 2 is dropped baseline for categorical
134        cls.idx = [7, 3, 4, 5, 6, 0, 1]
135
136        # example without offset
137        formula = 'deaths ~ logpyears + smokes + C(agecat)'
138        mod = Poisson.from_formula(formula, data=data)
139        # get start_params, example fails to converge on one CI run
140        k_vars = len(mod.exog_names)
141        start_params = np.zeros(k_vars)
142        start_params[0] = np.log(mod.endog.mean())
143        # if we need it, this is desired params
144        #  p = np.array([-3.93478643,  1.37276214,  2.33077032,  2.71338891,
145        #                2.71338891, 0.57966535,  0.97254074])
146
147        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
148        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
149        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
150                                   start_params=start_params,
151                                   fit_kwds={'method': 'bfgs', 'disp': 0})
152        # TODO: Newton fails
153
154        # test method of Poisson, not monkey patched
155        cls.res1m = mod.fit_constrained(constr, start_params=start_params,
156                                        method='bfgs', disp=0)
157
158    @pytest.mark.smoke
159    def test_summary(self):
160        # trailing text in summary, assumes it's the first extra string
161        # NOTE: see comment about convergence in llnull for self.res1m
162        summ = self.res1m.summary()
163        assert_('linear equality constraints' in summ.extra_txt)
164
165    @pytest.mark.smoke
166    def test_summary2(self):
167        # trailing text in summary, assumes it's the first extra string
168        # NOTE: see comment about convergence in llnull for self.res1m
169        summ = self.res1m.summary2()
170        assert_('linear equality constraints' in summ.extra_txt[0])
171
172
173class TestPoissonConstrained1b(CheckPoissonConstrainedMixin):
174
175    @classmethod
176    def setup_class(cls):
177
178        cls.res2 = results.results_exposure_constraint
179        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical
180
181        # example without offset
182        formula = 'deaths ~ smokes + C(agecat)'
183        mod = Poisson.from_formula(formula, data=data,
184                                   exposure=data['pyears'].values)
185        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
186        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
187        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
188                                   fit_kwds={'method': 'newton',
189                                             'disp': 0})
190        cls.constraints = lc
191        # TODO: bfgs fails
192        # test method of Poisson, not monkey patched
193        cls.res1m = mod.fit_constrained(constr, method='newton',
194                                        disp=0)
195
196
197class TestPoissonConstrained1c(CheckPoissonConstrainedMixin):
198
199    @classmethod
200    def setup_class(cls):
201
202        cls.res2 = results.results_exposure_constraint
203        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical
204
205        # example without offset
206        formula = 'deaths ~ smokes + C(agecat)'
207        mod = Poisson.from_formula(formula, data=data,
208                                   offset=np.log(data['pyears'].values))
209        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
210        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
211        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
212                                   fit_kwds={'method': 'newton',
213                                             'disp': 0})
214        cls.constraints = lc
215        # TODO: bfgs fails
216
217        # test method of Poisson, not monkey patched
218        cls.res1m = mod.fit_constrained(constr, method='newton', disp=0)
219
220
221class TestPoissonNoConstrained(CheckPoissonConstrainedMixin):
222
223    @classmethod
224    def setup_class(cls):
225
226        cls.res2 = results.results_exposure_noconstraint
227        cls.idx = [6, 2, 3, 4, 5, 0]  # 1 is dropped baseline for categorical
228
229        # example without offset
230        formula = 'deaths ~ smokes + C(agecat)'
231        mod = Poisson.from_formula(formula, data=data,
232                                   offset=np.log(data['pyears'].values))
233        res1 = mod.fit(disp=0)._results
234        # res1 is duplicate check, so we can follow the same pattern
235        cls.res1 = (res1.params, res1.cov_params())
236        cls.res1m = res1
237
238
239class TestPoissonConstrained2a(CheckPoissonConstrainedMixin):
240
241    @classmethod
242    def setup_class(cls):
243
244        cls.res2 = results.results_noexposure_constraint2
245        # 2 is dropped baseline for categorical
246        cls.idx = [7, 3, 4, 5, 6, 0, 1]
247
248        # example without offset
249        formula = 'deaths ~ logpyears + smokes + C(agecat)'
250        mod = Poisson.from_formula(formula, data=data)
251
252        # get start_params, example fails to converge on one CI run
253        k_vars = len(mod.exog_names)
254        start_params = np.zeros(k_vars)
255        start_params[0] = np.log(mod.endog.mean())
256        # if we need it, this is desired params
257        #  p = np.array([-9.43762015,  1.52762442,  2.74155711,  3.58730007,
258        #                4.08730007,  1.15987869,  0.12111539])
259
260        constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
261        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
262        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
263                                   start_params=start_params,
264                                   fit_kwds={'method': 'bfgs', 'disp': 0})
265        # TODO: Newton fails
266
267        # test method of Poisson, not monkey patched
268        cls.res1m = mod.fit_constrained(constr, start_params=start_params,
269                                        method='bfgs', disp=0)
270
271
272class TestPoissonConstrained2b(CheckPoissonConstrainedMixin):
273
274    @classmethod
275    def setup_class(cls):
276
277        cls.res2 = results.results_exposure_constraint2
278        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical
279
280        # example without offset
281        formula = 'deaths ~ smokes + C(agecat)'
282        mod = Poisson.from_formula(formula, data=data,
283                                   exposure=data['pyears'].values)
284        constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
285        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
286        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
287                                   fit_kwds={'method': 'newton',
288                                             'disp': 0})
289        cls.constraints = lc
290        # TODO: bfgs fails to converge. overflow somewhere?
291
292        # test method of Poisson, not monkey patched
293        cls.res1m = mod.fit_constrained(constr, method='bfgs', disp=0,
294                                        start_params=cls.res1[0])
295
296
297class TestPoissonConstrained2c(CheckPoissonConstrainedMixin):
298
299    @classmethod
300    def setup_class(cls):
301
302        cls.res2 = results.results_exposure_constraint2
303        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical
304
305        # example without offset
306        formula = 'deaths ~ smokes + C(agecat)'
307        mod = Poisson.from_formula(formula, data=data,
308                                   offset=np.log(data['pyears'].values))
309
310        constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5'
311        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
312        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
313                                   fit_kwds={'method': 'newton', 'disp': 0})
314        cls.constraints = lc
315        # TODO: bfgs fails
316
317        # test method of Poisson, not monkey patched
318        cls.res1m = mod.fit_constrained(constr,
319                                        method='bfgs', disp=0,
320                                        start_params=cls.res1[0])
321
322
323class TestGLMPoissonConstrained1a(CheckPoissonConstrainedMixin):
324
325    @classmethod
326    def setup_class(cls):
327        from statsmodels.base._constraints import fit_constrained
328
329        cls.res2 = results.results_noexposure_constraint
330        # 2 is dropped baseline for categorical
331        cls.idx = [7, 3, 4, 5, 6, 0, 1]
332
333        # example without offset
334        formula = 'deaths ~ logpyears + smokes + C(agecat)'
335        mod = GLM.from_formula(formula, data=data,
336                               family=families.Poisson())
337
338        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
339        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
340        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
341                                   fit_kwds={'atol': 1e-10})
342        cls.constraints = lc
343        cls.res1m = mod.fit_constrained(constr, atol=1e-10)
344
345
346class TestGLMPoissonConstrained1b(CheckPoissonConstrainedMixin):
347
348    @classmethod
349    def setup_class(cls):
350        from statsmodels.base._constraints import fit_constrained
351        from statsmodels.genmod import families
352        from statsmodels.genmod.generalized_linear_model import GLM
353
354        cls.res2 = results.results_exposure_constraint
355        cls.idx = [6, 2, 3, 4, 5, 0]  # 2 is dropped baseline for categorical
356
357        # example with offset
358        formula = 'deaths ~ smokes + C(agecat)'
359        mod = GLM.from_formula(formula, data=data,
360                               family=families.Poisson(),
361                               offset=np.log(data['pyears'].values))
362
363        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
364        lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr)
365
366        cls.res1 = fit_constrained(mod, lc.coefs, lc.constants,
367                                   fit_kwds={'atol': 1e-10})
368        cls.constraints = lc
369        cls.res1m = mod.fit_constrained(constr, atol=1e-10)._results
370
371    def test_compare_glm_poisson(self):
372        res1 = self.res1m
373        res2 = self.res2
374
375        formula = 'deaths ~ smokes + C(agecat)'
376        mod = Poisson.from_formula(formula, data=data,
377                                   exposure=data['pyears'].values)
378
379        constr = 'C(agecat)[T.4] = C(agecat)[T.5]'
380        res2 = mod.fit_constrained(constr, start_params=self.res1m.params,
381                                   method='newton', warn_convergence=False,
382                                   disp=0)
383
384        # we get high precision because we use the params as start_params
385
386        # basic, just as check that we have the same model
387        assert_allclose(res1.params, res2.params, rtol=1e-12)
388        assert_allclose(res1.bse, res2.bse, rtol=1e-11)
389
390        # check predict, fitted, ...
391
392        predicted = res1.predict()
393        assert_allclose(predicted, res2.predict(), rtol=1e-10)
394        assert_allclose(res1.mu, predicted, rtol=1e-10)
395        assert_allclose(res1.fittedvalues, predicted, rtol=1e-10)
396        assert_allclose(res2.predict(linear=True), res2.predict(linear=True),
397                        rtol=1e-10)
398
399
400class CheckGLMConstrainedMixin(CheckPoissonConstrainedMixin):
401    # add tests for some GLM specific attributes
402
403    def test_glm(self):
404        res2 = self.res2  # reference results
405        res1 = self.res1m
406
407        # FIXME: dont leave commented-out
408        # assert_allclose(res1.aic, res2.aic, rtol=1e-10)  # far away
409        # Stata aic in ereturn and in estat ic are very different
410        # we have the same as estat ic
411        # see issue GH#1733
412        assert_allclose(res1.aic, res2.infocrit[4], rtol=1e-10)
413
414        import warnings
415
416        with warnings.catch_warnings():
417            warnings.simplefilter("ignore", FutureWarning)
418            # FutureWarning for BIC changes
419            assert_allclose(res1.bic, res2.bic, rtol=1e-10)
420        # bic is deviance based
421        # FIXME: dont leave commented-out
422        #  assert_allclose(res1.bic, res2.infocrit[5], rtol=1e-10)
423        assert_allclose(res1.deviance, res2.deviance, rtol=1e-10)
424        # FIXME: dont leave commented-out
425        # TODO: which chi2 are these
426        # assert_allclose(res1.pearson_chi2, res2.chi2, rtol=1e-10)
427
428
429class TestGLMLogitConstrained1(CheckGLMConstrainedMixin):
430
431    @classmethod
432    def setup_class(cls):
433        cls.idx = slice(None)
434        # params sequence same as Stata, but Stata reports param = nan
435        # and we have param = value = 0
436
437        cls.res2 = reslogit.results_constraint1
438
439        mod1 = GLM(spector_data.endog, spector_data.exog,
440                   family=families.Binomial())
441
442        constr = 'x1 = 2.8'
443        cls.res1m = mod1.fit_constrained(constr)
444
445        R, q = cls.res1m.constraints
446        cls.res1 = fit_constrained(mod1, R, q)
447
448
449class TestGLMLogitConstrained2(CheckGLMConstrainedMixin):
450
451    @classmethod
452    def setup_class(cls):
453        cls.idx = slice(None)  # params sequence same as Stata
454        cls.res2 = reslogit.results_constraint2
455
456        mod1 = GLM(spector_data.endog, spector_data.exog,
457                   family=families.Binomial())
458
459        constr = 'x1 - x3 = 0'
460        cls.res1m = mod1.fit_constrained(constr, atol=1e-10)
461
462        # patsy compatible constraints
463        R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants
464        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10})
465        cls.constraints_rq = (R, q)
466
467    def test_predict(self):
468        # results only available for this case
469        res2 = self.res2  # reference results
470        res1 = self.res1m
471
472        predicted = res1.predict()
473        assert_allclose(predicted, res2.predict_mu, atol=1e-7)
474        assert_allclose(res1.mu, predicted, rtol=1e-10)
475        assert_allclose(res1.fittedvalues, predicted, rtol=1e-10)
476
477    @pytest.mark.smoke
478    def test_summary(self):
479        # trailing text in summary, assumes it's the first extra string
480        summ = self.res1m.summary()
481        assert_('linear equality constraints' in summ.extra_txt)
482
483        lc_string = str(self.res1m.constraints)
484        assert lc_string == "x1 - x3 = 0.0"
485
486    @pytest.mark.smoke
487    def test_summary2(self):
488        # trailing text in summary, assumes it's the first extra string
489        import warnings
490
491        with warnings.catch_warnings():
492            warnings.simplefilter("ignore", FutureWarning)
493            # FutureWarning for BIC changes
494            summ = self.res1m.summary2()
495
496        assert_('linear equality constraints' in summ.extra_txt[0])
497
498    def test_fit_constrained_wrap(self):
499        # minimal test
500        res2 = self.res2  # reference results
501
502        from statsmodels.base._constraints import fit_constrained_wrap
503        res_wrap = fit_constrained_wrap(self.res1m.model, self.constraints_rq)
504        assert_allclose(res_wrap.params, res2.params, rtol=1e-6)
505        assert_allclose(res_wrap.params, res2.params, rtol=1e-6)
506
507
508class TestGLMLogitConstrained2HC(CheckGLMConstrainedMixin):
509
510    @classmethod
511    def setup_class(cls):
512        cls.idx = slice(None)  # params sequence same as Stata
513        cls.res2 = reslogit.results_constraint2_robust
514
515        mod1 = GLM(spector_data.endog, spector_data.exog,
516                   family=families.Binomial())
517
518        # not used to match Stata for HC
519        # nobs, k_params = mod1.exog.shape
520        # k_params -= 1   # one constraint
521        cov_type = 'HC0'
522        cov_kwds = {'scaling_factor': 32/31}
523        # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)}
524        constr = 'x1 - x3 = 0'
525        cls.res1m = mod1.fit_constrained(constr, cov_type=cov_type,
526                                         cov_kwds=cov_kwds, atol=1e-10)
527
528        R, q = cls.res1m.constraints
529        cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10,
530                                                         'cov_type': cov_type,
531                                                         'cov_kwds': cov_kwds})
532        cls.constraints_rq = (R, q)
533
534
535def junk():  # FIXME: make this into a test, or move/remove
536    # Singular Matrix in mod1a.fit()
537
538    # same as Stata default
539    formula2 = 'deaths ~ C(agecat) + C(smokes) : C(agecat)'
540
541    mod = Poisson.from_formula(formula2, data=data,
542                               exposure=data['pyears'].values)
543
544    mod.fit()
545
546    constraints = 'C(smokes)[T.1]:C(agecat)[3] = C(smokes)[T.1]:C(agec`at)[4]'
547
548    import patsy
549    lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constraints)
550    R, q = lc.coefs, lc.constants
551
552    mod.fit_constrained(R, q, fit_kwds={'method': 'bfgs'})
553
554    # example without offset
555    formula1a = 'deaths ~ logpyears + smokes + C(agecat)'
556    mod1a = Poisson.from_formula(formula1a, data=data)
557
558    mod1a.fit()
559    lc_1a = patsy.DesignInfo(mod1a.exog_names).linear_constraint(
560        'C(agecat)[T.4] = C(agecat)[T.5]')
561    mod1a.fit_constrained(lc_1a.coefs, lc_1a.constants,
562                          fit_kwds={'method': 'newton'})
563