1# -*- coding: utf-8 -*- 2""" 3Created on Fri May 30 16:22:29 2014 4 5Author: Josef Perktold 6License: BSD-3 7 8""" 9from io import StringIO 10 11import numpy as np 12from numpy.testing import assert_, assert_allclose, assert_equal 13import pandas as pd 14import patsy 15import pytest 16 17from statsmodels import datasets 18from statsmodels.base._constraints import fit_constrained 19from statsmodels.discrete.discrete_model import Poisson 20from statsmodels.genmod import families 21from statsmodels.genmod.generalized_linear_model import GLM 22from statsmodels.tools.tools import add_constant 23 24from .results import ( 25 results_glm_logit_constrained as reslogit, 26 results_poisson_constrained as results, 27) 28 29spector_data = datasets.spector.load() 30spector_data.endog = np.asarray(spector_data.endog) 31spector_data.exog = np.asarray(spector_data.exog) 32spector_data.exog = add_constant(spector_data.exog, prepend=False) 33 34 35DEBUG = False 36 37ss = '''\ 38agecat smokes deaths pyears 391 1 32 52407 402 1 104 43248 413 1 206 28612 424 1 186 12663 435 1 102 5317 441 0 2 18790 452 0 12 10673 463 0 28 5710 474 0 28 2585 485 0 31 1462''' 49 50data = pd.read_csv(StringIO(ss), delimiter='\t') 51data = data.astype(int) 52data['logpyears'] = np.log(data['pyears']) 53 54 55class CheckPoissonConstrainedMixin(object): 56 57 def test_basic(self): 58 res1 = self.res1 59 res2 = self.res2 60 assert_allclose(res1[0], res2.params[self.idx], rtol=1e-6) 61 # see below Stata has nan, we have zero 62 bse1 = np.sqrt(np.diag(res1[1])) 63 mask = (bse1 == 0) & np.isnan(res2.bse[self.idx]) 64 assert_allclose(bse1[~mask], res2.bse[self.idx][~mask], rtol=1e-6) 65 66 def test_basic_method(self): 67 if hasattr(self, 'res1m'): 68 res1 = (self.res1m if not hasattr(self.res1m, '_results') 69 else self.res1m._results) 70 res2 = self.res2 71 assert_allclose(res1.params, res2.params[self.idx], rtol=1e-6) 72 73 # when a parameter is fixed, the Stata has bse=nan, we have bse=0 74 mask = (res1.bse == 0) & np.isnan(res2.bse[self.idx]) 75 assert_allclose(res1.bse[~mask], res2.bse[self.idx][~mask], 76 rtol=1e-6) 77 78 tvalues = res2.params_table[self.idx, 2] 79 # when a parameter is fixed, the Stata has tvalue=nan, 80 # we have tvalue=inf 81 mask = np.isinf(res1.tvalues) & np.isnan(tvalues) 82 assert_allclose(res1.tvalues[~mask], tvalues[~mask], rtol=1e-6) 83 pvalues = res2.params_table[self.idx, 3] 84 # note most pvalues are very small 85 # examples so far agree at 8 or more decimal, but rtol is stricter 86 # see above 87 mask = (res1.pvalues == 0) & np.isnan(pvalues) 88 assert_allclose(res1.pvalues[~mask], pvalues[~mask], rtol=5e-5) 89 90 ci_low = res2.params_table[self.idx, 4] 91 ci_upp = res2.params_table[self.idx, 5] 92 ci = np.column_stack((ci_low, ci_upp)) 93 # note most pvalues are very small 94 # examples so far agree at 8 or more decimal, but rtol is stricter 95 # see above: nan versus value 96 assert_allclose(res1.conf_int()[~np.isnan(ci)], ci[~np.isnan(ci)], 97 rtol=5e-5) 98 99 # other 100 assert_allclose(res1.llf, res2.ll, rtol=1e-6) 101 assert_equal(res1.df_model, res2.df_m) 102 # Stata does not have df_resid 103 df_r = res2.N - res2.df_m - 1 104 assert_equal(res1.df_resid, df_r) 105 else: 106 pytest.skip("not available yet") 107 108 def test_other(self): 109 # some results may not be valid or available for all models 110 if hasattr(self, 'res1m'): 111 res1 = self.res1m 112 res2 = self.res2 113 114 if hasattr(res2, 'll_0'): 115 assert_allclose(res1.llnull, res2.ll_0, rtol=1e-6) 116 else: 117 if DEBUG: 118 import warnings 119 message = ('test: ll_0 not available, llnull=%6.4F' 120 % res1.llnull) 121 warnings.warn(message) 122 123 else: 124 pytest.skip("not available yet") 125 126 127class TestPoissonConstrained1a(CheckPoissonConstrainedMixin): 128 129 @classmethod 130 def setup_class(cls): 131 132 cls.res2 = results.results_noexposure_constraint 133 # 2 is dropped baseline for categorical 134 cls.idx = [7, 3, 4, 5, 6, 0, 1] 135 136 # example without offset 137 formula = 'deaths ~ logpyears + smokes + C(agecat)' 138 mod = Poisson.from_formula(formula, data=data) 139 # get start_params, example fails to converge on one CI run 140 k_vars = len(mod.exog_names) 141 start_params = np.zeros(k_vars) 142 start_params[0] = np.log(mod.endog.mean()) 143 # if we need it, this is desired params 144 # p = np.array([-3.93478643, 1.37276214, 2.33077032, 2.71338891, 145 # 2.71338891, 0.57966535, 0.97254074]) 146 147 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 148 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 149 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 150 start_params=start_params, 151 fit_kwds={'method': 'bfgs', 'disp': 0}) 152 # TODO: Newton fails 153 154 # test method of Poisson, not monkey patched 155 cls.res1m = mod.fit_constrained(constr, start_params=start_params, 156 method='bfgs', disp=0) 157 158 @pytest.mark.smoke 159 def test_summary(self): 160 # trailing text in summary, assumes it's the first extra string 161 # NOTE: see comment about convergence in llnull for self.res1m 162 summ = self.res1m.summary() 163 assert_('linear equality constraints' in summ.extra_txt) 164 165 @pytest.mark.smoke 166 def test_summary2(self): 167 # trailing text in summary, assumes it's the first extra string 168 # NOTE: see comment about convergence in llnull for self.res1m 169 summ = self.res1m.summary2() 170 assert_('linear equality constraints' in summ.extra_txt[0]) 171 172 173class TestPoissonConstrained1b(CheckPoissonConstrainedMixin): 174 175 @classmethod 176 def setup_class(cls): 177 178 cls.res2 = results.results_exposure_constraint 179 cls.idx = [6, 2, 3, 4, 5, 0] # 2 is dropped baseline for categorical 180 181 # example without offset 182 formula = 'deaths ~ smokes + C(agecat)' 183 mod = Poisson.from_formula(formula, data=data, 184 exposure=data['pyears'].values) 185 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 186 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 187 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 188 fit_kwds={'method': 'newton', 189 'disp': 0}) 190 cls.constraints = lc 191 # TODO: bfgs fails 192 # test method of Poisson, not monkey patched 193 cls.res1m = mod.fit_constrained(constr, method='newton', 194 disp=0) 195 196 197class TestPoissonConstrained1c(CheckPoissonConstrainedMixin): 198 199 @classmethod 200 def setup_class(cls): 201 202 cls.res2 = results.results_exposure_constraint 203 cls.idx = [6, 2, 3, 4, 5, 0] # 2 is dropped baseline for categorical 204 205 # example without offset 206 formula = 'deaths ~ smokes + C(agecat)' 207 mod = Poisson.from_formula(formula, data=data, 208 offset=np.log(data['pyears'].values)) 209 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 210 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 211 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 212 fit_kwds={'method': 'newton', 213 'disp': 0}) 214 cls.constraints = lc 215 # TODO: bfgs fails 216 217 # test method of Poisson, not monkey patched 218 cls.res1m = mod.fit_constrained(constr, method='newton', disp=0) 219 220 221class TestPoissonNoConstrained(CheckPoissonConstrainedMixin): 222 223 @classmethod 224 def setup_class(cls): 225 226 cls.res2 = results.results_exposure_noconstraint 227 cls.idx = [6, 2, 3, 4, 5, 0] # 1 is dropped baseline for categorical 228 229 # example without offset 230 formula = 'deaths ~ smokes + C(agecat)' 231 mod = Poisson.from_formula(formula, data=data, 232 offset=np.log(data['pyears'].values)) 233 res1 = mod.fit(disp=0)._results 234 # res1 is duplicate check, so we can follow the same pattern 235 cls.res1 = (res1.params, res1.cov_params()) 236 cls.res1m = res1 237 238 239class TestPoissonConstrained2a(CheckPoissonConstrainedMixin): 240 241 @classmethod 242 def setup_class(cls): 243 244 cls.res2 = results.results_noexposure_constraint2 245 # 2 is dropped baseline for categorical 246 cls.idx = [7, 3, 4, 5, 6, 0, 1] 247 248 # example without offset 249 formula = 'deaths ~ logpyears + smokes + C(agecat)' 250 mod = Poisson.from_formula(formula, data=data) 251 252 # get start_params, example fails to converge on one CI run 253 k_vars = len(mod.exog_names) 254 start_params = np.zeros(k_vars) 255 start_params[0] = np.log(mod.endog.mean()) 256 # if we need it, this is desired params 257 # p = np.array([-9.43762015, 1.52762442, 2.74155711, 3.58730007, 258 # 4.08730007, 1.15987869, 0.12111539]) 259 260 constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5' 261 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 262 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 263 start_params=start_params, 264 fit_kwds={'method': 'bfgs', 'disp': 0}) 265 # TODO: Newton fails 266 267 # test method of Poisson, not monkey patched 268 cls.res1m = mod.fit_constrained(constr, start_params=start_params, 269 method='bfgs', disp=0) 270 271 272class TestPoissonConstrained2b(CheckPoissonConstrainedMixin): 273 274 @classmethod 275 def setup_class(cls): 276 277 cls.res2 = results.results_exposure_constraint2 278 cls.idx = [6, 2, 3, 4, 5, 0] # 2 is dropped baseline for categorical 279 280 # example without offset 281 formula = 'deaths ~ smokes + C(agecat)' 282 mod = Poisson.from_formula(formula, data=data, 283 exposure=data['pyears'].values) 284 constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5' 285 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 286 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 287 fit_kwds={'method': 'newton', 288 'disp': 0}) 289 cls.constraints = lc 290 # TODO: bfgs fails to converge. overflow somewhere? 291 292 # test method of Poisson, not monkey patched 293 cls.res1m = mod.fit_constrained(constr, method='bfgs', disp=0, 294 start_params=cls.res1[0]) 295 296 297class TestPoissonConstrained2c(CheckPoissonConstrainedMixin): 298 299 @classmethod 300 def setup_class(cls): 301 302 cls.res2 = results.results_exposure_constraint2 303 cls.idx = [6, 2, 3, 4, 5, 0] # 2 is dropped baseline for categorical 304 305 # example without offset 306 formula = 'deaths ~ smokes + C(agecat)' 307 mod = Poisson.from_formula(formula, data=data, 308 offset=np.log(data['pyears'].values)) 309 310 constr = 'C(agecat)[T.5] - C(agecat)[T.4] = 0.5' 311 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 312 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 313 fit_kwds={'method': 'newton', 'disp': 0}) 314 cls.constraints = lc 315 # TODO: bfgs fails 316 317 # test method of Poisson, not monkey patched 318 cls.res1m = mod.fit_constrained(constr, 319 method='bfgs', disp=0, 320 start_params=cls.res1[0]) 321 322 323class TestGLMPoissonConstrained1a(CheckPoissonConstrainedMixin): 324 325 @classmethod 326 def setup_class(cls): 327 from statsmodels.base._constraints import fit_constrained 328 329 cls.res2 = results.results_noexposure_constraint 330 # 2 is dropped baseline for categorical 331 cls.idx = [7, 3, 4, 5, 6, 0, 1] 332 333 # example without offset 334 formula = 'deaths ~ logpyears + smokes + C(agecat)' 335 mod = GLM.from_formula(formula, data=data, 336 family=families.Poisson()) 337 338 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 339 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 340 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 341 fit_kwds={'atol': 1e-10}) 342 cls.constraints = lc 343 cls.res1m = mod.fit_constrained(constr, atol=1e-10) 344 345 346class TestGLMPoissonConstrained1b(CheckPoissonConstrainedMixin): 347 348 @classmethod 349 def setup_class(cls): 350 from statsmodels.base._constraints import fit_constrained 351 from statsmodels.genmod import families 352 from statsmodels.genmod.generalized_linear_model import GLM 353 354 cls.res2 = results.results_exposure_constraint 355 cls.idx = [6, 2, 3, 4, 5, 0] # 2 is dropped baseline for categorical 356 357 # example with offset 358 formula = 'deaths ~ smokes + C(agecat)' 359 mod = GLM.from_formula(formula, data=data, 360 family=families.Poisson(), 361 offset=np.log(data['pyears'].values)) 362 363 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 364 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constr) 365 366 cls.res1 = fit_constrained(mod, lc.coefs, lc.constants, 367 fit_kwds={'atol': 1e-10}) 368 cls.constraints = lc 369 cls.res1m = mod.fit_constrained(constr, atol=1e-10)._results 370 371 def test_compare_glm_poisson(self): 372 res1 = self.res1m 373 res2 = self.res2 374 375 formula = 'deaths ~ smokes + C(agecat)' 376 mod = Poisson.from_formula(formula, data=data, 377 exposure=data['pyears'].values) 378 379 constr = 'C(agecat)[T.4] = C(agecat)[T.5]' 380 res2 = mod.fit_constrained(constr, start_params=self.res1m.params, 381 method='newton', warn_convergence=False, 382 disp=0) 383 384 # we get high precision because we use the params as start_params 385 386 # basic, just as check that we have the same model 387 assert_allclose(res1.params, res2.params, rtol=1e-12) 388 assert_allclose(res1.bse, res2.bse, rtol=1e-11) 389 390 # check predict, fitted, ... 391 392 predicted = res1.predict() 393 assert_allclose(predicted, res2.predict(), rtol=1e-10) 394 assert_allclose(res1.mu, predicted, rtol=1e-10) 395 assert_allclose(res1.fittedvalues, predicted, rtol=1e-10) 396 assert_allclose(res2.predict(linear=True), res2.predict(linear=True), 397 rtol=1e-10) 398 399 400class CheckGLMConstrainedMixin(CheckPoissonConstrainedMixin): 401 # add tests for some GLM specific attributes 402 403 def test_glm(self): 404 res2 = self.res2 # reference results 405 res1 = self.res1m 406 407 # FIXME: dont leave commented-out 408 # assert_allclose(res1.aic, res2.aic, rtol=1e-10) # far away 409 # Stata aic in ereturn and in estat ic are very different 410 # we have the same as estat ic 411 # see issue GH#1733 412 assert_allclose(res1.aic, res2.infocrit[4], rtol=1e-10) 413 414 import warnings 415 416 with warnings.catch_warnings(): 417 warnings.simplefilter("ignore", FutureWarning) 418 # FutureWarning for BIC changes 419 assert_allclose(res1.bic, res2.bic, rtol=1e-10) 420 # bic is deviance based 421 # FIXME: dont leave commented-out 422 # assert_allclose(res1.bic, res2.infocrit[5], rtol=1e-10) 423 assert_allclose(res1.deviance, res2.deviance, rtol=1e-10) 424 # FIXME: dont leave commented-out 425 # TODO: which chi2 are these 426 # assert_allclose(res1.pearson_chi2, res2.chi2, rtol=1e-10) 427 428 429class TestGLMLogitConstrained1(CheckGLMConstrainedMixin): 430 431 @classmethod 432 def setup_class(cls): 433 cls.idx = slice(None) 434 # params sequence same as Stata, but Stata reports param = nan 435 # and we have param = value = 0 436 437 cls.res2 = reslogit.results_constraint1 438 439 mod1 = GLM(spector_data.endog, spector_data.exog, 440 family=families.Binomial()) 441 442 constr = 'x1 = 2.8' 443 cls.res1m = mod1.fit_constrained(constr) 444 445 R, q = cls.res1m.constraints 446 cls.res1 = fit_constrained(mod1, R, q) 447 448 449class TestGLMLogitConstrained2(CheckGLMConstrainedMixin): 450 451 @classmethod 452 def setup_class(cls): 453 cls.idx = slice(None) # params sequence same as Stata 454 cls.res2 = reslogit.results_constraint2 455 456 mod1 = GLM(spector_data.endog, spector_data.exog, 457 family=families.Binomial()) 458 459 constr = 'x1 - x3 = 0' 460 cls.res1m = mod1.fit_constrained(constr, atol=1e-10) 461 462 # patsy compatible constraints 463 R, q = cls.res1m.constraints.coefs, cls.res1m.constraints.constants 464 cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10}) 465 cls.constraints_rq = (R, q) 466 467 def test_predict(self): 468 # results only available for this case 469 res2 = self.res2 # reference results 470 res1 = self.res1m 471 472 predicted = res1.predict() 473 assert_allclose(predicted, res2.predict_mu, atol=1e-7) 474 assert_allclose(res1.mu, predicted, rtol=1e-10) 475 assert_allclose(res1.fittedvalues, predicted, rtol=1e-10) 476 477 @pytest.mark.smoke 478 def test_summary(self): 479 # trailing text in summary, assumes it's the first extra string 480 summ = self.res1m.summary() 481 assert_('linear equality constraints' in summ.extra_txt) 482 483 lc_string = str(self.res1m.constraints) 484 assert lc_string == "x1 - x3 = 0.0" 485 486 @pytest.mark.smoke 487 def test_summary2(self): 488 # trailing text in summary, assumes it's the first extra string 489 import warnings 490 491 with warnings.catch_warnings(): 492 warnings.simplefilter("ignore", FutureWarning) 493 # FutureWarning for BIC changes 494 summ = self.res1m.summary2() 495 496 assert_('linear equality constraints' in summ.extra_txt[0]) 497 498 def test_fit_constrained_wrap(self): 499 # minimal test 500 res2 = self.res2 # reference results 501 502 from statsmodels.base._constraints import fit_constrained_wrap 503 res_wrap = fit_constrained_wrap(self.res1m.model, self.constraints_rq) 504 assert_allclose(res_wrap.params, res2.params, rtol=1e-6) 505 assert_allclose(res_wrap.params, res2.params, rtol=1e-6) 506 507 508class TestGLMLogitConstrained2HC(CheckGLMConstrainedMixin): 509 510 @classmethod 511 def setup_class(cls): 512 cls.idx = slice(None) # params sequence same as Stata 513 cls.res2 = reslogit.results_constraint2_robust 514 515 mod1 = GLM(spector_data.endog, spector_data.exog, 516 family=families.Binomial()) 517 518 # not used to match Stata for HC 519 # nobs, k_params = mod1.exog.shape 520 # k_params -= 1 # one constraint 521 cov_type = 'HC0' 522 cov_kwds = {'scaling_factor': 32/31} 523 # looks like nobs / (nobs - 1) and not (nobs - 1.) / (nobs - k_params)} 524 constr = 'x1 - x3 = 0' 525 cls.res1m = mod1.fit_constrained(constr, cov_type=cov_type, 526 cov_kwds=cov_kwds, atol=1e-10) 527 528 R, q = cls.res1m.constraints 529 cls.res1 = fit_constrained(mod1, R, q, fit_kwds={'atol': 1e-10, 530 'cov_type': cov_type, 531 'cov_kwds': cov_kwds}) 532 cls.constraints_rq = (R, q) 533 534 535def junk(): # FIXME: make this into a test, or move/remove 536 # Singular Matrix in mod1a.fit() 537 538 # same as Stata default 539 formula2 = 'deaths ~ C(agecat) + C(smokes) : C(agecat)' 540 541 mod = Poisson.from_formula(formula2, data=data, 542 exposure=data['pyears'].values) 543 544 mod.fit() 545 546 constraints = 'C(smokes)[T.1]:C(agecat)[3] = C(smokes)[T.1]:C(agec`at)[4]' 547 548 import patsy 549 lc = patsy.DesignInfo(mod.exog_names).linear_constraint(constraints) 550 R, q = lc.coefs, lc.constants 551 552 mod.fit_constrained(R, q, fit_kwds={'method': 'bfgs'}) 553 554 # example without offset 555 formula1a = 'deaths ~ logpyears + smokes + C(agecat)' 556 mod1a = Poisson.from_formula(formula1a, data=data) 557 558 mod1a.fit() 559 lc_1a = patsy.DesignInfo(mod1a.exog_names).linear_constraint( 560 'C(agecat)[T.4] = C(agecat)[T.5]') 561 mod1a.fit_constrained(lc_1a.coefs, lc_1a.constants, 562 fit_kwds={'method': 'newton'}) 563