1""" 2Test functions for models.GLM 3""" 4import os 5import warnings 6 7import numpy as np 8from numpy.testing import ( 9 assert_, 10 assert_allclose, 11 assert_almost_equal, 12 assert_array_less, 13 assert_equal, 14 assert_raises, 15) 16import pandas as pd 17from pandas.testing import assert_series_equal 18import pytest 19from scipy import stats 20 21import statsmodels.api as sm 22from statsmodels.datasets import cpunish, longley 23from statsmodels.discrete import discrete_model as discrete 24from statsmodels.genmod.generalized_linear_model import GLM, SET_USE_BIC_LLF 25from statsmodels.tools.numdiff import ( 26 approx_fprime, 27 approx_fprime_cs, 28 approx_hess, 29 approx_hess_cs, 30) 31from statsmodels.tools.sm_exceptions import ( 32 DomainWarning, 33 PerfectSeparationError, 34 ValueWarning, 35) 36from statsmodels.tools.tools import add_constant 37 38# Test Precisions 39DECIMAL_4 = 4 40DECIMAL_3 = 3 41DECIMAL_2 = 2 42DECIMAL_1 = 1 43DECIMAL_0 = 0 44 45pdf_output = False 46 47if pdf_output: 48 from matplotlib.backends.backend_pdf import PdfPages 49 pdf = PdfPages("test_glm.pdf") 50else: 51 pdf = None 52 53 54def close_or_save(pdf, fig): 55 if pdf_output: 56 pdf.savefig(fig) 57 58 59def teardown_module(): 60 if pdf_output: 61 pdf.close() 62 63 64@pytest.fixture(scope="module") 65def iris(): 66 cur_dir = os.path.dirname(os.path.abspath(__file__)) 67 return np.genfromtxt(os.path.join(cur_dir, 'results', 'iris.csv'), 68 delimiter=",", skip_header=1) 69 70 71class CheckModelResultsMixin(object): 72 ''' 73 res2 should be either the results from RModelWrap 74 or the results as defined in model_results_data 75 ''' 76 77 decimal_params = DECIMAL_4 78 def test_params(self): 79 assert_almost_equal(self.res1.params, self.res2.params, 80 self.decimal_params) 81 82 decimal_bse = DECIMAL_4 83 def test_standard_errors(self): 84 assert_allclose(self.res1.bse, self.res2.bse, 85 atol=10**(-self.decimal_bse), rtol=1e-5) 86 87 decimal_resids = DECIMAL_4 88 def test_residuals(self): 89 # fix incorrect numbers in resid_working results 90 # residuals for Poisson are also tested in test_glm_weights.py 91 import copy 92 93 # new numpy would have copy method 94 resid2 = copy.copy(self.res2.resids) 95 resid2[:, 2] *= self.res1.family.link.deriv(self.res1.mu)**2 96 97 atol = 10**(-self.decimal_resids) 98 resid_a = self.res1.resid_anscombe_unscaled 99 resids = np.column_stack((self.res1.resid_pearson, 100 self.res1.resid_deviance, self.res1.resid_working, 101 resid_a, self.res1.resid_response)) 102 assert_allclose(resids, resid2, rtol=1e-6, atol=atol) 103 104 decimal_aic_R = DECIMAL_4 105 106 def test_aic_R(self): 107 # R includes the estimation of the scale as a lost dof 108 # Does not with Gamma though 109 if self.res1.scale != 1: 110 dof = 2 111 else: 112 dof = 0 113 if isinstance(self.res1.model.family, (sm.families.NegativeBinomial)): 114 llf = self.res1.model.family.loglike(self.res1.model.endog, 115 self.res1.mu, 116 self.res1.model.var_weights, 117 self.res1.model.freq_weights, 118 scale=1) 119 aic = (-2*llf+2*(self.res1.df_model+1)) 120 else: 121 aic = self.res1.aic 122 assert_almost_equal(aic+dof, self.res2.aic_R, 123 self.decimal_aic_R) 124 125 decimal_aic_Stata = DECIMAL_4 126 def test_aic_Stata(self): 127 # Stata uses the below llf for aic definition for these families 128 if isinstance(self.res1.model.family, (sm.families.Gamma, 129 sm.families.InverseGaussian, 130 sm.families.NegativeBinomial)): 131 llf = self.res1.model.family.loglike(self.res1.model.endog, 132 self.res1.mu, 133 self.res1.model.var_weights, 134 self.res1.model.freq_weights, 135 scale=1) 136 aic = (-2*llf+2*(self.res1.df_model+1))/self.res1.nobs 137 else: 138 aic = self.res1.aic/self.res1.nobs 139 assert_almost_equal(aic, self.res2.aic_Stata, self.decimal_aic_Stata) 140 141 decimal_deviance = DECIMAL_4 142 def test_deviance(self): 143 assert_almost_equal(self.res1.deviance, self.res2.deviance, 144 self.decimal_deviance) 145 146 decimal_scale = DECIMAL_4 147 def test_scale(self): 148 assert_almost_equal(self.res1.scale, self.res2.scale, 149 self.decimal_scale) 150 151 decimal_loglike = DECIMAL_4 152 def test_loglike(self): 153 # Stata uses the below llf for these families 154 # We differ with R for them 155 if isinstance(self.res1.model.family, (sm.families.Gamma, 156 sm.families.InverseGaussian, 157 sm.families.NegativeBinomial)): 158 llf = self.res1.model.family.loglike(self.res1.model.endog, 159 self.res1.mu, 160 self.res1.model.var_weights, 161 self.res1.model.freq_weights, 162 scale=1) 163 else: 164 llf = self.res1.llf 165 assert_almost_equal(llf, self.res2.llf, self.decimal_loglike) 166 167 decimal_null_deviance = DECIMAL_4 168 def test_null_deviance(self): 169 with warnings.catch_warnings(): 170 warnings.simplefilter("ignore", DomainWarning) 171 172 assert_almost_equal(self.res1.null_deviance, 173 self.res2.null_deviance, 174 self.decimal_null_deviance) 175 176 decimal_bic = DECIMAL_4 177 def test_bic(self): 178 with warnings.catch_warnings(): 179 warnings.simplefilter("ignore") 180 assert_almost_equal(self.res1.bic, 181 self.res2.bic_Stata, 182 self.decimal_bic) 183 184 def test_degrees(self): 185 assert_equal(self.res1.model.df_resid,self.res2.df_resid) 186 187 decimal_fittedvalues = DECIMAL_4 188 def test_fittedvalues(self): 189 assert_almost_equal(self.res1.fittedvalues, self.res2.fittedvalues, 190 self.decimal_fittedvalues) 191 192 def test_tpvalues(self): 193 # test comparing tvalues and pvalues with normal implementation 194 # make sure they use normal distribution (inherited in results class) 195 params = self.res1.params 196 tvalues = params / self.res1.bse 197 pvalues = stats.norm.sf(np.abs(tvalues)) * 2 198 half_width = stats.norm.isf(0.025) * self.res1.bse 199 conf_int = np.column_stack((params - half_width, params + half_width)) 200 if isinstance(tvalues, pd.Series): 201 assert_series_equal(self.res1.tvalues, tvalues) 202 else: 203 assert_almost_equal(self.res1.tvalues, tvalues) 204 assert_almost_equal(self.res1.pvalues, pvalues) 205 assert_almost_equal(self.res1.conf_int(), conf_int) 206 207 def test_pearson_chi2(self): 208 if hasattr(self.res2, 'pearson_chi2'): 209 assert_allclose(self.res1.pearson_chi2, self.res2.pearson_chi2, 210 atol=1e-6, rtol=1e-6) 211 212 def test_prsquared(self): 213 if hasattr(self.res2, 'prsquared'): 214 assert_allclose(self.res1.pseudo_rsquared(kind="mcf"), 215 self.res2.prsquared, rtol=0.05) 216 217 if hasattr(self.res2, 'prsquared_cox_snell'): 218 assert_allclose(float(self.res1.pseudo_rsquared(kind="cs")), 219 self.res2.prsquared_cox_snell, rtol=0.05) 220 221 @pytest.mark.smoke 222 def test_summary(self): 223 self.res1.summary() 224 225 @pytest.mark.smoke 226 def test_summary2(self): 227 with warnings.catch_warnings(): 228 warnings.simplefilter("ignore", DomainWarning) 229 self.res1.summary2() 230 231 def test_get_distribution(self): 232 res1 = self.res1 233 if not hasattr(res1.model.family, "get_distribution"): 234 # only Tweedie has not get_distribution 235 pytest.skip("get_distribution not available") 236 237 if isinstance(res1.model.family, sm.families.NegativeBinomial): 238 res_scale = 1 # QMLE scale can differ from 1 239 else: 240 res_scale = res1.scale 241 242 distr = res1.model.family.get_distribution(res1.fittedvalues, 243 res_scale) 244 var_endog = res1.model.family.variance(res1.fittedvalues) * res_scale 245 m, v = distr.stats() 246 assert_allclose(res1.fittedvalues, m, rtol=1e-13) 247 assert_allclose(var_endog, v, rtol=1e-13) 248 # check model method 249 distr2 = res1.model.get_distribution(res1.params, res_scale) 250 for k in distr2.kwds: 251 assert_allclose(distr.kwds[k], distr2.kwds[k], rtol=1e-13) 252 253 254class CheckComparisonMixin(object): 255 256 def test_compare_discrete(self): 257 res1 = self.res1 258 resd = self.resd 259 260 assert_allclose(res1.llf, resd.llf, rtol=1e-10) 261 score_obs1 = res1.model.score_obs(res1.params * 0.98) 262 score_obsd = resd.model.score_obs(resd.params * 0.98) 263 assert_allclose(score_obs1, score_obsd, rtol=1e-10) 264 265 # score 266 score1 = res1.model.score(res1.params * 0.98) 267 assert_allclose(score1, score_obs1.sum(0), atol=1e-20) 268 score0 = res1.model.score(res1.params) 269 assert_allclose(score0, np.zeros(score_obs1.shape[1]), atol=5e-7) 270 271 hessian1 = res1.model.hessian(res1.params * 0.98, observed=False) 272 hessiand = resd.model.hessian(resd.params * 0.98) 273 assert_allclose(hessian1, hessiand, rtol=1e-10) 274 275 hessian1 = res1.model.hessian(res1.params * 0.98, observed=True) 276 hessiand = resd.model.hessian(resd.params * 0.98) 277 assert_allclose(hessian1, hessiand, rtol=1e-9) 278 279 def test_score_test(self): 280 res1 = self.res1 281 # fake example, should be zero, k_constraint should be 0 282 st, pv, df = res1.model.score_test(res1.params, k_constraints=1) 283 assert_allclose(st, 0, atol=1e-20) 284 assert_allclose(pv, 1, atol=1e-10) 285 assert_equal(df, 1) 286 287 st, pv, df = res1.model.score_test(res1.params, k_constraints=0) 288 assert_allclose(st, 0, atol=1e-20) 289 assert_(np.isnan(pv), msg=repr(pv)) 290 assert_equal(df, 0) 291 292 # TODO: no verified numbers largely SMOKE test 293 exog_extra = res1.model.exog[:,1]**2 294 st, pv, df = res1.model.score_test(res1.params, exog_extra=exog_extra) 295 assert_array_less(0.1, st) 296 assert_array_less(0.1, pv) 297 assert_equal(df, 1) 298 299 300class TestGlmGaussian(CheckModelResultsMixin): 301 @classmethod 302 def setup_class(cls): 303 ''' 304 Test Gaussian family with canonical identity link 305 ''' 306 # Test Precisions 307 cls.decimal_resids = DECIMAL_3 308 cls.decimal_params = DECIMAL_2 309 cls.decimal_bic = DECIMAL_0 310 cls.decimal_bse = DECIMAL_3 311 312 from statsmodels.datasets.longley import load 313 cls.data = load() 314 cls.data.endog = np.asarray(cls.data.endog) 315 cls.data.exog = np.asarray(cls.data.exog) 316 cls.data.exog = add_constant(cls.data.exog, prepend=False) 317 cls.res1 = GLM(cls.data.endog, cls.data.exog, 318 family=sm.families.Gaussian()).fit() 319 from .results.results_glm import Longley 320 cls.res2 = Longley() 321 322 323 def test_compare_OLS(self): 324 res1 = self.res1 325 # OLS does not define score_obs 326 from statsmodels.regression.linear_model import OLS 327 resd = OLS(self.data.endog, self.data.exog).fit() 328 self.resd = resd # attach to access from the outside 329 330 assert_allclose(res1.llf, resd.llf, rtol=1e-10) 331 score_obs1 = res1.model.score_obs(res1.params, scale=None) 332 score_obsd = resd.resid[:, None] / resd.scale * resd.model.exog 333 # low precision because of badly scaled exog 334 assert_allclose(score_obs1, score_obsd, rtol=1e-8) 335 336 score_obs1 = res1.model.score_obs(res1.params, scale=1) 337 score_obsd = resd.resid[:, None] * resd.model.exog 338 assert_allclose(score_obs1, score_obsd, rtol=1e-8) 339 340 hess_obs1 = res1.model.hessian(res1.params, scale=None) 341 hess_obsd = -1. / resd.scale * resd.model.exog.T.dot(resd.model.exog) 342 # low precision because of badly scaled exog 343 assert_allclose(hess_obs1, hess_obsd, rtol=1e-8) 344 345# FIXME: enable or delete 346# def setup(self): 347# if skipR: 348# raise SkipTest, "Rpy not installed." 349# Gauss = r.gaussian 350# self.res2 = RModel(self.data.endog, self.data.exog, r.glm, family=Gauss) 351# self.res2.resids = np.array(self.res2.resid)[:,None]*np.ones((1,5)) 352# self.res2.null_deviance = 185008826 # taken from R. Rpy bug? 353 354 355class TestGlmGaussianGradient(TestGlmGaussian): 356 @classmethod 357 def setup_class(cls): 358 ''' 359 Test Gaussian family with canonical identity link 360 ''' 361 # Test Precisions 362 cls.decimal_resids = DECIMAL_3 363 cls.decimal_params = DECIMAL_2 364 cls.decimal_bic = DECIMAL_0 365 cls.decimal_bse = DECIMAL_2 366 367 from statsmodels.datasets.longley import load 368 cls.data = load() 369 cls.data.endog = np.asarray(cls.data.endog) 370 cls.data.exog = np.asarray(cls.data.exog) 371 cls.data.exog = add_constant(cls.data.exog, prepend=False) 372 cls.res1 = GLM(cls.data.endog, cls.data.exog, 373 family=sm.families.Gaussian()).fit(method='bfgs') 374 from .results.results_glm import Longley 375 cls.res2 = Longley() 376 377 378class TestGaussianLog(CheckModelResultsMixin): 379 @classmethod 380 def setup_class(cls): 381 # Test Precision 382 cls.decimal_aic_R = DECIMAL_0 383 cls.decimal_aic_Stata = DECIMAL_2 384 cls.decimal_loglike = DECIMAL_0 385 cls.decimal_null_deviance = DECIMAL_1 386 387 nobs = 100 388 x = np.arange(nobs) 389 np.random.seed(54321) 390# y = 1.0 - .02*x - .001*x**2 + 0.001 * np.random.randn(nobs) 391 cls.X = np.c_[np.ones((nobs,1)),x,x**2] 392 cls.lny = np.exp(-(-1.0 + 0.02*x + 0.0001*x**2)) +\ 393 0.001 * np.random.randn(nobs) 394 395 GaussLog_Model = GLM(cls.lny, cls.X, 396 family=sm.families.Gaussian(sm.families.links.log())) 397 cls.res1 = GaussLog_Model.fit() 398 from .results.results_glm import GaussianLog 399 cls.res2 = GaussianLog() 400 401# FIXME: enable or delete 402# def setup(cls): 403# if skipR: 404# raise SkipTest, "Rpy not installed" 405# GaussLogLink = r.gaussian(link = "log") 406# GaussLog_Res_R = RModel(cls.lny, cls.X, r.glm, family=GaussLogLink) 407# cls.res2 = GaussLog_Res_R 408 409class TestGaussianInverse(CheckModelResultsMixin): 410 @classmethod 411 def setup_class(cls): 412 # Test Precisions 413 cls.decimal_bic = DECIMAL_1 414 cls.decimal_aic_R = DECIMAL_1 415 cls.decimal_aic_Stata = DECIMAL_3 416 cls.decimal_loglike = DECIMAL_1 417 cls.decimal_resids = DECIMAL_3 418 419 nobs = 100 420 x = np.arange(nobs) 421 np.random.seed(54321) 422 y = 1.0 + 2.0 * x + x**2 + 0.1 * np.random.randn(nobs) 423 cls.X = np.c_[np.ones((nobs,1)),x,x**2] 424 cls.y_inv = (1. + .02*x + .001*x**2)**-1 + .001 * np.random.randn(nobs) 425 InverseLink_Model = GLM(cls.y_inv, cls.X, 426 family=sm.families.Gaussian(sm.families.links.inverse_power())) 427 InverseLink_Res = InverseLink_Model.fit() 428 cls.res1 = InverseLink_Res 429 from .results.results_glm import GaussianInverse 430 cls.res2 = GaussianInverse() 431 432# FIXME: enable or delete 433# def setup(cls): 434# if skipR: 435# raise SkipTest, "Rpy not installed." 436# InverseLink = r.gaussian(link = "inverse") 437# InverseLink_Res_R = RModel(cls.y_inv, cls.X, r.glm, family=InverseLink) 438# cls.res2 = InverseLink_Res_R 439 440class TestGlmBinomial(CheckModelResultsMixin): 441 @classmethod 442 def setup_class(cls): 443 ''' 444 Test Binomial family with canonical logit link using star98 dataset. 445 ''' 446 cls.decimal_resids = DECIMAL_1 447 cls.decimal_bic = DECIMAL_2 448 449 from statsmodels.datasets.star98 import load 450 451 from .results.results_glm import Star98 452 data = load() 453 data.endog = np.asarray(data.endog) 454 data.exog = np.asarray(data.exog) 455 data.exog = add_constant(data.exog, prepend=False) 456 cls.res1 = GLM(data.endog, data.exog, 457 family=sm.families.Binomial()).fit() 458 # NOTE: if you want to replicate with RModel 459 # res2 = RModel(data.endog[:,0]/trials, data.exog, r.glm, 460 # family=r.binomial, weights=trials) 461 462 cls.res2 = Star98() 463 464 def test_endog_dtype(self): 465 from statsmodels.datasets.star98 import load 466 data = load() 467 data.exog = add_constant(data.exog, prepend=False) 468 endog = data.endog.astype(int) 469 res2 = GLM(endog, data.exog, family=sm.families.Binomial()).fit() 470 assert_allclose(res2.params, self.res1.params) 471 endog = data.endog.astype(np.double) 472 res3 = GLM(endog, data.exog, family=sm.families.Binomial()).fit() 473 assert_allclose(res3.params, self.res1.params) 474 475 def test_invalid_endog(self, reset_randomstate): 476 # GH2733 inspired check 477 endog = np.random.randint(0, 100, size=(1000, 3)) 478 exog = np.random.standard_normal((1000, 2)) 479 with pytest.raises(ValueError, match='endog has more than 2 columns'): 480 GLM(endog, exog, family=sm.families.Binomial()) 481 482 def test_invalid_endog_formula(self, reset_randomstate): 483 # GH2733 484 n = 200 485 exog = np.random.normal(size=(n, 2)) 486 endog = np.random.randint(0, 3, size=n).astype(str) 487 # formula interface 488 data = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]}) 489 with pytest.raises(ValueError, match='array with multiple columns'): 490 sm.GLM.from_formula("y ~ x1 + x2", data, 491 family=sm.families.Binomial()) 492 493 def test_get_distribution_binom_count(self): 494 # test for binomial counts with n_trials > 1 495 res1 = self.res1 496 res_scale = 1 # QMLE scale can differ from 1 497 498 mu_prob = res1.fittedvalues 499 n = res1.model.n_trials 500 distr = res1.model.family.get_distribution(mu_prob, res_scale, 501 n_trials=n) 502 var_endog = res1.model.family.variance(mu_prob) * res_scale 503 m, v = distr.stats() 504 assert_allclose(mu_prob * n, m, rtol=1e-13) 505 assert_allclose(var_endog * n, v, rtol=1e-13) 506 507 # check model method 508 distr2 = res1.model.get_distribution(res1.params, res_scale, 509 n_trials=n) 510 for k in distr2.kwds: 511 assert_allclose(distr.kwds[k], distr2.kwds[k], rtol=1e-13) 512 513 514# FIXME: enable/xfail/skip or delete 515# TODO: 516# Non-Canonical Links for the Binomial family require the algorithm to be 517# slightly changed 518# class TestGlmBinomialLog(CheckModelResultsMixin): 519# pass 520 521# class TestGlmBinomialLogit(CheckModelResultsMixin): 522# pass 523 524# class TestGlmBinomialProbit(CheckModelResultsMixin): 525# pass 526 527# class TestGlmBinomialCloglog(CheckModelResultsMixin): 528# pass 529 530# class TestGlmBinomialPower(CheckModelResultsMixin): 531# pass 532 533# class TestGlmBinomialLoglog(CheckModelResultsMixin): 534# pass 535 536# class TestGlmBinomialLogc(CheckModelResultsMixin): 537# TODO: need include logc link 538# pass 539 540 541class TestGlmBernoulli(CheckModelResultsMixin, CheckComparisonMixin): 542 @classmethod 543 def setup_class(cls): 544 from .results.results_glm import Lbw 545 cls.res2 = Lbw() 546 cls.res1 = GLM(cls.res2.endog, cls.res2.exog, 547 family=sm.families.Binomial()).fit() 548 549 modd = discrete.Logit(cls.res2.endog, cls.res2.exog) 550 cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False) 551 552 def test_score_r(self): 553 res1 = self.res1 554 res2 = self.res2 555 st, pv, df = res1.model.score_test(res1.params, 556 exog_extra=res1.model.exog[:, 1]**2) 557 st_res = 0.2837680293459376 # (-0.5326988167303712)**2 558 assert_allclose(st, st_res, rtol=1e-4) 559 560 st, pv, df = res1.model.score_test(res1.params, 561 exog_extra=res1.model.exog[:, 0]**2) 562 st_res = 0.6713492821514992 # (-0.8193590679009413)**2 563 assert_allclose(st, st_res, rtol=1e-4) 564 565 select = list(range(9)) 566 select.pop(7) 567 568 res1b = GLM(res2.endog, res2.exog.iloc[:, select], 569 family=sm.families.Binomial()).fit() 570 tres = res1b.model.score_test(res1b.params, 571 exog_extra=res1.model.exog[:, -2]) 572 tres = np.asarray(tres[:2]).ravel() 573 tres_r = (2.7864148487452, 0.0950667) 574 assert_allclose(tres, tres_r, rtol=1e-4) 575 576 cmd_r = """\ 577 data = read.csv("...statsmodels\\statsmodels\\genmod\\tests\\results\\stata_lbw_glm.csv") 578 579 data["race_black"] = data["race"] == "black" 580 data["race_other"] = data["race"] == "other" 581 mod = glm(low ~ age + lwt + race_black + race_other + smoke + ptl + ht + ui, family=binomial, data=data) 582 options(digits=16) 583 anova(mod, test="Rao") 584 585 library(statmod) 586 s = glm.scoretest(mod, data["age"]**2) 587 s**2 588 s = glm.scoretest(mod, data["lwt"]**2) 589 s**2 590 """ 591 592# class TestGlmBernoulliIdentity(CheckModelResultsMixin): 593# pass 594 595# class TestGlmBernoulliLog(CheckModelResultsMixin): 596# pass 597 598# class TestGlmBernoulliProbit(CheckModelResultsMixin): 599# pass 600 601# class TestGlmBernoulliCloglog(CheckModelResultsMixin): 602# pass 603 604# class TestGlmBernoulliPower(CheckModelResultsMixin): 605# pass 606 607# class TestGlmBernoulliLoglog(CheckModelResultsMixin): 608# pass 609 610# class test_glm_bernoulli_logc(CheckModelResultsMixin): 611# pass 612 613 614class TestGlmGamma(CheckModelResultsMixin): 615 616 @classmethod 617 def setup_class(cls): 618 ''' 619 Tests Gamma family with canonical inverse link (power -1) 620 ''' 621 # Test Precisions 622 cls.decimal_aic_R = -1 #TODO: off by about 1, we are right with Stata 623 cls.decimal_resids = DECIMAL_2 624 625 from statsmodels.datasets.scotland import load 626 627 from .results.results_glm import Scotvote 628 data = load() 629 data.exog = add_constant(data.exog, prepend=False) 630 with warnings.catch_warnings(): 631 warnings.simplefilter("ignore") 632 res1 = GLM(data.endog, data.exog, 633 family=sm.families.Gamma()).fit() 634 cls.res1 = res1 635# res2 = RModel(data.endog, data.exog, r.glm, family=r.Gamma) 636 res2 = Scotvote() 637 res2.aic_R += 2 # R does not count degree of freedom for scale with gamma 638 cls.res2 = res2 639 640 641class TestGlmGammaLog(CheckModelResultsMixin): 642 @classmethod 643 def setup_class(cls): 644 # Test Precisions 645 cls.decimal_resids = DECIMAL_3 646 cls.decimal_aic_R = DECIMAL_0 647 cls.decimal_fittedvalues = DECIMAL_3 648 649 from .results.results_glm import CancerLog 650 res2 = CancerLog() 651 cls.res1 = GLM(res2.endog, res2.exog, 652 family=sm.families.Gamma(link=sm.families.links.log())).fit() 653 cls.res2 = res2 654 655# FIXME: enable or delete 656# def setup(cls): 657# if skipR: 658# raise SkipTest, "Rpy not installed." 659# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm, 660# family=r.Gamma(link="log")) 661# cls.res2.null_deviance = 27.92207137420696 # From R (bug in rpy) 662# cls.res2.bic = -154.1582089453923 # from Stata 663 664 665class TestGlmGammaIdentity(CheckModelResultsMixin): 666 @classmethod 667 def setup_class(cls): 668 # Test Precisions 669 cls.decimal_resids = -100 #TODO Very off from Stata? 670 cls.decimal_params = DECIMAL_2 671 cls.decimal_aic_R = DECIMAL_0 672 cls.decimal_loglike = DECIMAL_1 673 674 from .results.results_glm import CancerIdentity 675 res2 = CancerIdentity() 676 with warnings.catch_warnings(): 677 warnings.simplefilter("ignore") 678 fam = sm.families.Gamma(link=sm.families.links.identity()) 679 cls.res1 = GLM(res2.endog, res2.exog, family=fam).fit() 680 cls.res2 = res2 681 682# FIXME: enable or delete 683# def setup(cls): 684# if skipR: 685# raise SkipTest, "Rpy not installed." 686# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm, 687# family=r.Gamma(link="identity")) 688# cls.res2.null_deviance = 27.92207137420696 # from R, Rpy bug 689 690class TestGlmPoisson(CheckModelResultsMixin, CheckComparisonMixin): 691 @classmethod 692 def setup_class(cls): 693 ''' 694 Tests Poisson family with canonical log link. 695 696 Test results were obtained by R. 697 ''' 698 from .results.results_glm import Cpunish 699 cls.data = cpunish.load() 700 cls.data.endog = np.asarray(cls.data.endog) 701 cls.data.exog = np.asarray(cls.data.exog) 702 cls.data.exog[:, 3] = np.log(cls.data.exog[:, 3]) 703 cls.data.exog = add_constant(cls.data.exog, prepend=False) 704 cls.res1 = GLM(cls.data.endog, cls.data.exog, 705 family=sm.families.Poisson()).fit() 706 cls.res2 = Cpunish() 707 # compare with discrete, start close to save time 708 modd = discrete.Poisson(cls.data.endog, cls.data.exog) 709 cls.resd = modd.fit(start_params=cls.res1.params * 0.9, disp=False) 710 711#class TestGlmPoissonIdentity(CheckModelResultsMixin): 712# pass 713 714#class TestGlmPoissonPower(CheckModelResultsMixin): 715# pass 716 717 718class TestGlmInvgauss(CheckModelResultsMixin): 719 @classmethod 720 def setup_class(cls): 721 ''' 722 Tests the Inverse Gaussian family in GLM. 723 724 Notes 725 ----- 726 Used the rndivgx.ado file provided by Hardin and Hilbe to 727 generate the data. Results are read from model_results, which 728 were obtained by running R_ig.s 729 ''' 730 # Test Precisions 731 cls.decimal_aic_R = DECIMAL_0 732 cls.decimal_loglike = DECIMAL_0 733 734 from .results.results_glm import InvGauss 735 res2 = InvGauss() 736 res1 = GLM(res2.endog, res2.exog, 737 family=sm.families.InverseGaussian()).fit() 738 cls.res1 = res1 739 cls.res2 = res2 740 741 def test_get_distribution(self): 742 res1 = self.res1 743 distr = res1.model.family.get_distribution(res1.fittedvalues, 744 res1.scale) 745 var_endog = res1.model.family.variance(res1.fittedvalues) * res1.scale 746 m, v = distr.stats() 747 assert_allclose(res1.fittedvalues, m, rtol=1e-13) 748 assert_allclose(var_endog, v, rtol=1e-13) 749 750 751class TestGlmInvgaussLog(CheckModelResultsMixin): 752 @classmethod 753 def setup_class(cls): 754 # Test Precisions 755 cls.decimal_aic_R = -10 # Big difference vs R. 756 cls.decimal_resids = DECIMAL_3 757 758 from .results.results_glm import InvGaussLog 759 res2 = InvGaussLog() 760 cls.res1 = GLM(res2.endog, res2.exog, 761 family=sm.families.InverseGaussian( 762 link=sm.families.links.log())).fit() 763 cls.res2 = res2 764 765# FIXME: enable or delete 766# def setup(cls): 767# if skipR: 768# raise SkipTest, "Rpy not installed." 769# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm, 770# family=r.inverse_gaussian(link="log")) 771# cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug 772# cls.res2.llf = -12162.72308 # from Stata, R's has big rounding diff 773 774 775class TestGlmInvgaussIdentity(CheckModelResultsMixin): 776 @classmethod 777 def setup_class(cls): 778 # Test Precisions 779 cls.decimal_aic_R = -10 #TODO: Big difference vs R 780 cls.decimal_fittedvalues = DECIMAL_3 781 cls.decimal_params = DECIMAL_3 782 783 from .results.results_glm import Medpar1 784 data = Medpar1() 785 with warnings.catch_warnings(): 786 warnings.simplefilter("ignore") 787 cls.res1 = GLM(data.endog, data.exog, 788 family=sm.families.InverseGaussian( 789 link=sm.families.links.identity())).fit() 790 from .results.results_glm import InvGaussIdentity 791 cls.res2 = InvGaussIdentity() 792 793# FIXME: enable or delete 794# def setup(cls): 795# if skipR: 796# raise SkipTest, "Rpy not installed." 797# cls.res2 = RModel(cls.data.endog, cls.data.exog, r.glm, 798# family=r.inverse_gaussian(link="identity")) 799# cls.res2.null_deviance = 335.1539777981053 # from R, Rpy bug 800# cls.res2.llf = -12163.25545 # from Stata, big diff with R 801 802 803class TestGlmNegbinomial(CheckModelResultsMixin): 804 @classmethod 805 def setup_class(cls): 806 ''' 807 Test Negative Binomial family with log link 808 ''' 809 # Test Precision 810 cls.decimal_resid = DECIMAL_1 811 cls.decimal_params = DECIMAL_3 812 cls.decimal_resids = -1 # 1 % mismatch at 0 813 cls.decimal_fittedvalues = DECIMAL_1 814 815 from statsmodels.datasets.committee import load 816 cls.data = load() 817 cls.data.endog = np.asarray(cls.data.endog) 818 cls.data.exog = np.asarray(cls.data.exog) 819 cls.data.exog[:,2] = np.log(cls.data.exog[:,2]) 820 interaction = cls.data.exog[:,2]*cls.data.exog[:,1] 821 cls.data.exog = np.column_stack((cls.data.exog,interaction)) 822 cls.data.exog = add_constant(cls.data.exog, prepend=False) 823 with warnings.catch_warnings(): 824 warnings.simplefilter("ignore", category=DomainWarning) 825 fam = sm.families.NegativeBinomial() 826 827 cls.res1 = GLM(cls.data.endog, cls.data.exog, 828 family=fam).fit(scale='x2') 829 from .results.results_glm import Committee 830 res2 = Committee() 831 res2.aic_R += 2 # They do not count a degree of freedom for the scale 832 cls.res2 = res2 833 834# FIXME: enable or delete 835# def setup(self): 836# if skipR: 837# raise SkipTest, "Rpy not installed" 838# r.library('MASS') # this does not work when done in rmodelwrap? 839# self.res2 = RModel(self.data.endog, self.data.exog, r.glm, 840# family=r.negative_binomial(1)) 841# self.res2.null_deviance = 27.8110469364343 842 843# FIXME: enable/xfail/skip or delete 844#class TestGlmNegbinomial_log(CheckModelResultsMixin): 845# pass 846 847# FIXME: enable/xfail/skip or delete 848#class TestGlmNegbinomial_power(CheckModelResultsMixin): 849# pass 850 851# FIXME: enable/xfail/skip or delete 852#class TestGlmNegbinomial_nbinom(CheckModelResultsMixin): 853# pass 854 855 856class TestGlmPoissonOffset(CheckModelResultsMixin): 857 @classmethod 858 def setup_class(cls): 859 from .results.results_glm import Cpunish_offset 860 cls.decimal_params = DECIMAL_4 861 cls.decimal_bse = DECIMAL_4 862 cls.decimal_aic_R = 3 863 data = cpunish.load() 864 data.endog = np.asarray(data.endog) 865 data.exog = np.asarray(data.exog) 866 data.exog[:, 3] = np.log(data.exog[:, 3]) 867 data.exog = add_constant(data.exog, prepend=True) 868 exposure = [100] * len(data.endog) 869 cls.data = data 870 cls.exposure = exposure 871 cls.res1 = GLM(data.endog, data.exog, family=sm.families.Poisson(), 872 exposure=exposure).fit() 873 cls.res2 = Cpunish_offset() 874 875 def test_missing(self): 876 # make sure offset is dropped correctly 877 endog = self.data.endog.copy() 878 endog[[2,4,6,8]] = np.nan 879 mod = GLM(endog, self.data.exog, family=sm.families.Poisson(), 880 exposure=self.exposure, missing='drop') 881 assert_equal(mod.exposure.shape[0], 13) 882 883 def test_offset_exposure(self): 884 # exposure=x and offset=log(x) should have the same effect 885 np.random.seed(382304) 886 endog = np.random.randint(0, 10, 100) 887 exog = np.random.normal(size=(100,3)) 888 exposure = np.random.uniform(1, 2, 100) 889 offset = np.random.uniform(1, 2, 100) 890 mod1 = GLM(endog, exog, family=sm.families.Poisson(), 891 offset=offset, exposure=exposure).fit() 892 offset2 = offset + np.log(exposure) 893 mod2 = GLM(endog, exog, family=sm.families.Poisson(), 894 offset=offset2).fit() 895 assert_almost_equal(mod1.params, mod2.params) 896 assert_allclose(mod1.null, mod2.null, rtol=1e-10) 897 898 # test recreating model 899 mod1_ = mod1.model 900 kwds = mod1_._get_init_kwds() 901 assert_allclose(kwds['exposure'], exposure, rtol=1e-14) 902 assert_allclose(kwds['offset'], mod1_.offset, rtol=1e-14) 903 mod3 = mod1_.__class__(mod1_.endog, mod1_.exog, **kwds) 904 assert_allclose(mod3.exposure, mod1_.exposure, rtol=1e-14) 905 assert_allclose(mod3.offset, mod1_.offset, rtol=1e-14) 906 907 # test fit_regularized exposure, see #4605 908 resr1 = mod1.model.fit_regularized() 909 resr2 = mod2.model.fit_regularized() 910 assert_allclose(resr1.params, resr2.params, rtol=1e-10) 911 912 913 def test_predict(self): 914 np.random.seed(382304) 915 endog = np.random.randint(0, 10, 100) 916 exog = np.random.normal(size=(100,3)) 917 exposure = np.random.uniform(1, 2, 100) 918 mod1 = GLM(endog, exog, family=sm.families.Poisson(), 919 exposure=exposure).fit() 920 exog1 = np.random.normal(size=(10,3)) 921 exposure1 = np.random.uniform(1, 2, 10) 922 923 # Doubling exposure time should double expected response 924 pred1 = mod1.predict(exog=exog1, exposure=exposure1) 925 pred2 = mod1.predict(exog=exog1, exposure=2*exposure1) 926 assert_almost_equal(pred2, 2*pred1) 927 928 # Check exposure defaults 929 pred3 = mod1.predict() 930 pred4 = mod1.predict(exposure=exposure) 931 pred5 = mod1.predict(exog=exog, exposure=exposure) 932 assert_almost_equal(pred3, pred4) 933 assert_almost_equal(pred4, pred5) 934 935 # Check offset defaults 936 offset = np.random.uniform(1, 2, 100) 937 mod2 = GLM(endog, exog, offset=offset, family=sm.families.Poisson()).fit() 938 pred1 = mod2.predict() 939 pred2 = mod2.predict(offset=offset) 940 pred3 = mod2.predict(exog=exog, offset=offset) 941 assert_almost_equal(pred1, pred2) 942 assert_almost_equal(pred2, pred3) 943 944 # Check that offset shifts the linear predictor 945 mod3 = GLM(endog, exog, family=sm.families.Poisson()).fit() 946 offset = np.random.uniform(1, 2, 10) 947 pred1 = mod3.predict(exog=exog1, offset=offset, linear=True) 948 pred2 = mod3.predict(exog=exog1, offset=2*offset, linear=True) 949 assert_almost_equal(pred2, pred1+offset) 950 951 # Passing exposure as a pandas series should not effect output type 952 assert isinstance( 953 mod1.predict(exog=exog1, exposure=pd.Series(exposure1)), 954 np.ndarray 955 ) 956 957 958def test_perfect_pred(iris): 959 y = iris[:, -1] 960 X = iris[:, :-1] 961 X = X[y != 2] 962 y = y[y != 2] 963 X = add_constant(X, prepend=True) 964 glm = GLM(y, X, family=sm.families.Binomial()) 965 with warnings.catch_warnings(): 966 warnings.simplefilter("ignore", category=RuntimeWarning) 967 assert_raises(PerfectSeparationError, glm.fit) 968 969 970def test_score_test_ols(): 971 # nicer example than Longley 972 from statsmodels.regression.linear_model import OLS 973 np.random.seed(5) 974 nobs = 100 975 sige = 0.5 976 x = np.random.uniform(0, 1, size=(nobs, 5)) 977 x[:, 0] = 1 978 beta = 1. / np.arange(1., x.shape[1] + 1) 979 y = x.dot(beta) + sige * np.random.randn(nobs) 980 981 res_ols = OLS(y, x).fit() 982 res_olsc = OLS(y, x[:, :-2]).fit() 983 co = res_ols.compare_lm_test(res_olsc, demean=False) 984 985 res_glm = GLM(y, x[:, :-2], family=sm.families.Gaussian()).fit() 986 co2 = res_glm.model.score_test(res_glm.params, exog_extra=x[:, -2:]) 987 # difference in df_resid versus nobs in scale see #1786 988 assert_allclose(co[0] * 97 / 100., co2[0], rtol=1e-13) 989 990 991def test_attribute_writable_resettable(): 992 # Regression test for mutables and class constructors. 993 data = sm.datasets.longley.load() 994 endog, exog = data.endog, data.exog 995 glm_model = sm.GLM(endog, exog) 996 assert_equal(glm_model.family.link.power, 1.0) 997 glm_model.family.link.power = 2. 998 assert_equal(glm_model.family.link.power, 2.0) 999 glm_model2 = sm.GLM(endog, exog) 1000 assert_equal(glm_model2.family.link.power, 1.0) 1001 1002 1003class TestStartParams(CheckModelResultsMixin): 1004 @classmethod 1005 def setup_class(cls): 1006 ''' 1007 Test Gaussian family with canonical identity link 1008 ''' 1009 # Test Precisions 1010 cls.decimal_resids = DECIMAL_3 1011 cls.decimal_params = DECIMAL_2 1012 cls.decimal_bic = DECIMAL_0 1013 cls.decimal_bse = DECIMAL_3 1014 1015 from statsmodels.datasets.longley import load 1016 cls.data = load() 1017 cls.data.exog = add_constant(cls.data.exog, prepend=False) 1018 params = sm.OLS(cls.data.endog, cls.data.exog).fit().params 1019 cls.res1 = GLM(cls.data.endog, cls.data.exog, 1020 family=sm.families.Gaussian()).fit(start_params=params) 1021 from .results.results_glm import Longley 1022 cls.res2 = Longley() 1023 1024 1025def test_glm_start_params(): 1026 # see 1604 1027 y2 = np.array('0 1 0 0 0 1'.split(), int) 1028 wt = np.array([50,1,50,1,5,10]) 1029 y2 = np.repeat(y2, wt) 1030 x2 = np.repeat([0,0,0.001,100,-1,-1], wt) 1031 mod = sm.GLM(y2, sm.add_constant(x2), family=sm.families.Binomial()) 1032 res = mod.fit(start_params=[-4, -5]) 1033 np.testing.assert_almost_equal(res.params, [-4.60305022, -5.29634545], 6) 1034 1035 1036def test_loglike_no_opt(): 1037 # see 1728 1038 1039 y = np.asarray([0, 1, 0, 0, 1, 1, 0, 1, 1, 1]) 1040 x = np.arange(10, dtype=np.float64) 1041 1042 def llf(params): 1043 lin_pred = params[0] + params[1]*x 1044 pr = 1 / (1 + np.exp(-lin_pred)) 1045 return np.sum(y*np.log(pr) + (1-y)*np.log(1-pr)) 1046 1047 for params in [0,0], [0,1], [0.5,0.5]: 1048 mod = sm.GLM(y, sm.add_constant(x), family=sm.families.Binomial()) 1049 res = mod.fit(start_params=params, maxiter=0) 1050 like = llf(params) 1051 assert_almost_equal(like, res.llf) 1052 1053 1054def test_formula_missing_exposure(): 1055 # see 2083 1056 import statsmodels.formula.api as smf 1057 1058 d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan], 1059 'constant': [1] * 4, 'exposure': np.random.uniform(size=4), 1060 'x': [1, 3, 2, 1.5]} 1061 df = pd.DataFrame(d) 1062 1063 family = sm.families.Gaussian(link=sm.families.links.log()) 1064 1065 mod = smf.glm("Foo ~ Bar", data=df, exposure=df.exposure, 1066 family=family) 1067 assert_(type(mod.exposure) is np.ndarray, msg='Exposure is not ndarray') 1068 1069 exposure = pd.Series(np.random.uniform(size=5)) 1070 df.loc[3, 'Bar'] = 4 # nan not relevant for Valueerror for shape mismatch 1071 assert_raises(ValueError, smf.glm, "Foo ~ Bar", data=df, 1072 exposure=exposure, family=family) 1073 assert_raises(ValueError, GLM, df.Foo, df[['constant', 'Bar']], 1074 exposure=exposure, family=family) 1075 1076 1077@pytest.mark.matplotlib 1078def test_plots(close_figures): 1079 1080 np.random.seed(378) 1081 n = 200 1082 exog = np.random.normal(size=(n, 2)) 1083 lin_pred = exog[:, 0] + exog[:, 1]**2 1084 prob = 1 / (1 + np.exp(-lin_pred)) 1085 endog = 1 * (np.random.uniform(size=n) < prob) 1086 1087 model = sm.GLM(endog, exog, family=sm.families.Binomial()) 1088 result = model.fit() 1089 1090 import pandas as pd 1091 1092 from statsmodels.graphics.regressionplots import add_lowess 1093 1094 # array interface 1095 for j in 0,1: 1096 fig = result.plot_added_variable(j) 1097 add_lowess(fig.axes[0], frac=0.5) 1098 close_or_save(pdf, fig) 1099 fig = result.plot_partial_residuals(j) 1100 add_lowess(fig.axes[0], frac=0.5) 1101 close_or_save(pdf, fig) 1102 fig = result.plot_ceres_residuals(j) 1103 add_lowess(fig.axes[0], frac=0.5) 1104 close_or_save(pdf, fig) 1105 1106 # formula interface 1107 data = pd.DataFrame({"y": endog, "x1": exog[:, 0], "x2": exog[:, 1]}) 1108 model = sm.GLM.from_formula("y ~ x1 + x2", data, family=sm.families.Binomial()) 1109 result = model.fit() 1110 for j in 0,1: 1111 xname = ["x1", "x2"][j] 1112 fig = result.plot_added_variable(xname) 1113 add_lowess(fig.axes[0], frac=0.5) 1114 close_or_save(pdf, fig) 1115 fig = result.plot_partial_residuals(xname) 1116 add_lowess(fig.axes[0], frac=0.5) 1117 close_or_save(pdf, fig) 1118 fig = result.plot_ceres_residuals(xname) 1119 add_lowess(fig.axes[0], frac=0.5) 1120 close_or_save(pdf, fig) 1121 1122def gen_endog(lin_pred, family_class, link, binom_version=0): 1123 1124 np.random.seed(872) 1125 1126 fam = sm.families 1127 1128 mu = link().inverse(lin_pred) 1129 1130 if family_class == fam.Binomial: 1131 if binom_version == 0: 1132 endog = 1*(np.random.uniform(size=len(lin_pred)) < mu) 1133 else: 1134 endog = np.empty((len(lin_pred), 2)) 1135 n = 10 1136 endog[:, 0] = (np.random.uniform(size=(len(lin_pred), n)) < mu[:, None]).sum(1) 1137 endog[:, 1] = n - endog[:, 0] 1138 elif family_class == fam.Poisson: 1139 endog = np.random.poisson(mu) 1140 elif family_class == fam.Gamma: 1141 endog = np.random.gamma(2, mu) 1142 elif family_class == fam.Gaussian: 1143 endog = mu + 2 * np.random.normal(size=len(lin_pred)) 1144 elif family_class == fam.NegativeBinomial: 1145 from scipy.stats.distributions import nbinom 1146 endog = nbinom.rvs(mu, 0.5) 1147 elif family_class == fam.InverseGaussian: 1148 from scipy.stats.distributions import invgauss 1149 endog = invgauss.rvs(mu, scale=20) 1150 else: 1151 raise ValueError 1152 1153 return endog 1154 1155 1156@pytest.mark.smoke 1157def test_summary(): 1158 np.random.seed(4323) 1159 1160 n = 100 1161 exog = np.random.normal(size=(n, 2)) 1162 exog[:, 0] = 1 1163 endog = np.random.normal(size=n) 1164 1165 for method in ["irls", "cg"]: 1166 fa = sm.families.Gaussian() 1167 model = sm.GLM(endog, exog, family=fa) 1168 rslt = model.fit(method=method) 1169 s = rslt.summary() 1170 1171 1172def check_score_hessian(results): 1173 # compare models core and hessian with numerical derivatives 1174 1175 params = results.params 1176 # avoid checking score at MLE, score close to zero 1177 sc = results.model.score(params * 0.98, scale=1) 1178 # cs currently (0.9) does not work for all families 1179 llfunc = lambda x: results.model.loglike(x, scale=1) # noqa 1180 sc2 = approx_fprime(params * 0.98, llfunc) 1181 assert_allclose(sc, sc2, rtol=1e-4, atol=1e-4) 1182 1183 hess = results.model.hessian(params, scale=1) 1184 hess2 = approx_hess(params, llfunc) 1185 assert_allclose(hess, hess2, rtol=1e-4) 1186 scfunc = lambda x: results.model.score(x, scale=1) # noqa 1187 hess3 = approx_fprime(params, scfunc) 1188 assert_allclose(hess, hess3, rtol=1e-4) 1189 1190 1191def test_gradient_irls(): 1192 # Compare the results when using gradient optimization and IRLS. 1193 1194 # TODO: Find working examples for inverse_squared link 1195 1196 np.random.seed(87342) 1197 1198 fam = sm.families 1199 lnk = sm.families.links 1200 families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log, lnk.cauchy]), 1201 (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]), 1202 (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]), 1203 (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]), 1204 (fam.InverseGaussian, [lnk.log, lnk.identity, lnk.inverse_power, lnk.inverse_squared]), 1205 (fam.NegativeBinomial, [lnk.log, lnk.inverse_power, lnk.inverse_squared, lnk.identity])] 1206 1207 n = 100 1208 p = 3 1209 exog = np.random.normal(size=(n, p)) 1210 exog[:, 0] = 1 1211 1212 skip_one = False 1213 for family_class, family_links in families: 1214 for link in family_links: 1215 for binom_version in 0,1: 1216 1217 if family_class != fam.Binomial and binom_version == 1: 1218 continue 1219 1220 if (family_class, link) == (fam.Poisson, lnk.identity): 1221 lin_pred = 20 + exog.sum(1) 1222 elif (family_class, link) == (fam.Binomial, lnk.log): 1223 lin_pred = -1 + exog.sum(1) / 8 1224 elif (family_class, link) == (fam.Poisson, lnk.sqrt): 1225 lin_pred = 2 + exog.sum(1) 1226 elif (family_class, link) == (fam.InverseGaussian, lnk.log): 1227 #skip_zero = True 1228 lin_pred = -1 + exog.sum(1) 1229 elif (family_class, link) == (fam.InverseGaussian, lnk.identity): 1230 lin_pred = 20 + 5*exog.sum(1) 1231 lin_pred = np.clip(lin_pred, 1e-4, np.inf) 1232 elif (family_class, link) == (fam.InverseGaussian, lnk.inverse_squared): 1233 lin_pred = 0.5 + exog.sum(1) / 5 1234 continue # skip due to non-convergence 1235 elif (family_class, link) == (fam.InverseGaussian, lnk.inverse_power): 1236 lin_pred = 1 + exog.sum(1) / 5 1237 elif (family_class, link) == (fam.NegativeBinomial, lnk.identity): 1238 lin_pred = 20 + 5*exog.sum(1) 1239 lin_pred = np.clip(lin_pred, 1e-4, np.inf) 1240 elif (family_class, link) == (fam.NegativeBinomial, lnk.inverse_squared): 1241 lin_pred = 0.1 + np.random.uniform(size=exog.shape[0]) 1242 continue # skip due to non-convergence 1243 elif (family_class, link) == (fam.NegativeBinomial, lnk.inverse_power): 1244 lin_pred = 1 + exog.sum(1) / 5 1245 1246 elif (family_class, link) == (fam.Gaussian, lnk.inverse_power): 1247 # adding skip because of convergence failure 1248 skip_one = True 1249 # the following fails with identity link, because endog < 0 1250 # elif family_class == fam.Gamma: 1251 # lin_pred = 0.5 * exog.sum(1) + np.random.uniform(size=exog.shape[0]) 1252 else: 1253 lin_pred = np.random.uniform(size=exog.shape[0]) 1254 1255 endog = gen_endog(lin_pred, family_class, link, binom_version) 1256 1257 with warnings.catch_warnings(): 1258 warnings.simplefilter("ignore") 1259 mod_irls = sm.GLM(endog, exog, family=family_class(link=link())) 1260 rslt_irls = mod_irls.fit(method="IRLS") 1261 1262 if not (family_class, link) in [(fam.Poisson, lnk.sqrt), 1263 (fam.Gamma, lnk.inverse_power), 1264 (fam.InverseGaussian, lnk.identity) 1265 ]: 1266 check_score_hessian(rslt_irls) 1267 1268 # Try with and without starting values. 1269 for max_start_irls, start_params in (0, rslt_irls.params), (3, None): 1270 # TODO: skip convergence failures for now 1271 if max_start_irls > 0 and skip_one: 1272 continue 1273 with warnings.catch_warnings(): 1274 warnings.simplefilter("ignore") 1275 mod_gradient = sm.GLM(endog, exog, family=family_class(link=link())) 1276 rslt_gradient = mod_gradient.fit(max_start_irls=max_start_irls, 1277 start_params=start_params, 1278 method="newton", maxiter=300) 1279 1280 assert_allclose(rslt_gradient.params, 1281 rslt_irls.params, rtol=1e-6, atol=5e-5) 1282 1283 assert_allclose(rslt_gradient.llf, rslt_irls.llf, 1284 rtol=1e-6, atol=1e-6) 1285 1286 assert_allclose(rslt_gradient.scale, rslt_irls.scale, 1287 rtol=1e-6, atol=1e-6) 1288 1289 # Get the standard errors using expected information. 1290 gradient_bse = rslt_gradient.bse 1291 ehess = mod_gradient.hessian(rslt_gradient.params, observed=False) 1292 gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess))) 1293 assert_allclose(gradient_bse, rslt_irls.bse, rtol=1e-6, atol=5e-5) 1294 # rslt_irls.bse corresponds to observed=True 1295 assert_allclose(rslt_gradient.bse, rslt_irls.bse, rtol=0.2, atol=5e-5) 1296 1297 rslt_gradient_eim = mod_gradient.fit(max_start_irls=0, 1298 cov_type='eim', 1299 start_params=rslt_gradient.params, 1300 method="newton", maxiter=300) 1301 assert_allclose(rslt_gradient_eim.bse, rslt_irls.bse, rtol=5e-5, atol=0) 1302 1303 1304def test_gradient_irls_eim(): 1305 # Compare the results when using eime gradient optimization and IRLS. 1306 1307 # TODO: Find working examples for inverse_squared link 1308 1309 np.random.seed(87342) 1310 1311 fam = sm.families 1312 lnk = sm.families.links 1313 families = [(fam.Binomial, [lnk.logit, lnk.probit, lnk.cloglog, lnk.log, 1314 lnk.cauchy]), 1315 (fam.Poisson, [lnk.log, lnk.identity, lnk.sqrt]), 1316 (fam.Gamma, [lnk.log, lnk.identity, lnk.inverse_power]), 1317 (fam.Gaussian, [lnk.identity, lnk.log, lnk.inverse_power]), 1318 (fam.InverseGaussian, [lnk.log, lnk.identity, 1319 lnk.inverse_power, 1320 lnk.inverse_squared]), 1321 (fam.NegativeBinomial, [lnk.log, lnk.inverse_power, 1322 lnk.inverse_squared, lnk.identity])] 1323 1324 n = 100 1325 p = 3 1326 exog = np.random.normal(size=(n, p)) 1327 exog[:, 0] = 1 1328 1329 skip_one = False 1330 for family_class, family_links in families: 1331 for link in family_links: 1332 for binom_version in 0, 1: 1333 1334 if family_class != fam.Binomial and binom_version == 1: 1335 continue 1336 1337 if (family_class, link) == (fam.Poisson, lnk.identity): 1338 lin_pred = 20 + exog.sum(1) 1339 elif (family_class, link) == (fam.Binomial, lnk.log): 1340 lin_pred = -1 + exog.sum(1) / 8 1341 elif (family_class, link) == (fam.Poisson, lnk.sqrt): 1342 lin_pred = 2 + exog.sum(1) 1343 elif (family_class, link) == (fam.InverseGaussian, lnk.log): 1344 # skip_zero = True 1345 lin_pred = -1 + exog.sum(1) 1346 elif (family_class, link) == (fam.InverseGaussian, 1347 lnk.identity): 1348 lin_pred = 20 + 5*exog.sum(1) 1349 lin_pred = np.clip(lin_pred, 1e-4, np.inf) 1350 elif (family_class, link) == (fam.InverseGaussian, 1351 lnk.inverse_squared): 1352 lin_pred = 0.5 + exog.sum(1) / 5 1353 continue # skip due to non-convergence 1354 elif (family_class, link) == (fam.InverseGaussian, 1355 lnk.inverse_power): 1356 lin_pred = 1 + exog.sum(1) / 5 1357 elif (family_class, link) == (fam.NegativeBinomial, 1358 lnk.identity): 1359 lin_pred = 20 + 5*exog.sum(1) 1360 lin_pred = np.clip(lin_pred, 1e-4, np.inf) 1361 elif (family_class, link) == (fam.NegativeBinomial, 1362 lnk.inverse_squared): 1363 lin_pred = 0.1 + np.random.uniform(size=exog.shape[0]) 1364 continue # skip due to non-convergence 1365 elif (family_class, link) == (fam.NegativeBinomial, 1366 lnk.inverse_power): 1367 lin_pred = 1 + exog.sum(1) / 5 1368 1369 elif (family_class, link) == (fam.Gaussian, lnk.inverse_power): 1370 # adding skip because of convergence failure 1371 skip_one = True 1372 else: 1373 lin_pred = np.random.uniform(size=exog.shape[0]) 1374 1375 endog = gen_endog(lin_pred, family_class, link, binom_version) 1376 1377 with warnings.catch_warnings(): 1378 warnings.simplefilter("ignore") 1379 mod_irls = sm.GLM(endog, exog, 1380 family=family_class(link=link())) 1381 rslt_irls = mod_irls.fit(method="IRLS") 1382 1383 # Try with and without starting values. 1384 for max_start_irls, start_params in ((0, rslt_irls.params), 1385 (3, None)): 1386 # TODO: skip convergence failures for now 1387 if max_start_irls > 0 and skip_one: 1388 continue 1389 with warnings.catch_warnings(): 1390 warnings.simplefilter("ignore") 1391 mod_gradient = sm.GLM(endog, exog, 1392 family=family_class(link=link())) 1393 rslt_gradient = mod_gradient.fit( 1394 max_start_irls=max_start_irls, 1395 start_params=start_params, 1396 method="newton", 1397 optim_hessian='eim' 1398 ) 1399 1400 assert_allclose(rslt_gradient.params, rslt_irls.params, 1401 rtol=1e-6, atol=5e-5) 1402 1403 assert_allclose(rslt_gradient.llf, rslt_irls.llf, 1404 rtol=1e-6, atol=1e-6) 1405 1406 assert_allclose(rslt_gradient.scale, rslt_irls.scale, 1407 rtol=1e-6, atol=1e-6) 1408 1409 # Get the standard errors using expected information. 1410 ehess = mod_gradient.hessian(rslt_gradient.params, 1411 observed=False) 1412 gradient_bse = np.sqrt(-np.diag(np.linalg.inv(ehess))) 1413 1414 assert_allclose(gradient_bse, rslt_irls.bse, rtol=1e-6, 1415 atol=5e-5) 1416 1417 1418def test_glm_irls_method(): 1419 nobs, k_vars = 50, 4 1420 np.random.seed(987126) 1421 x = np.random.randn(nobs, k_vars - 1) 1422 exog = add_constant(x, has_constant='add') 1423 y = exog.sum(1) + np.random.randn(nobs) 1424 1425 mod = GLM(y, exog) 1426 res1 = mod.fit() 1427 res2 = mod.fit(wls_method='pinv', attach_wls=True) 1428 res3 = mod.fit(wls_method='qr', attach_wls=True) 1429 # fit_gradient does not attach mle_settings 1430 res_g1 = mod.fit(start_params=res1.params, method='bfgs') 1431 1432 for r in [res1, res2, res3]: 1433 assert_equal(r.mle_settings['optimizer'], 'IRLS') 1434 assert_equal(r.method, 'IRLS') 1435 1436 assert_equal(res1.mle_settings['wls_method'], 'lstsq') 1437 assert_equal(res2.mle_settings['wls_method'], 'pinv') 1438 assert_equal(res3.mle_settings['wls_method'], 'qr') 1439 1440 assert_(hasattr(res2.results_wls.model, 'pinv_wexog')) 1441 assert_(hasattr(res3.results_wls.model, 'exog_Q')) 1442 1443 # fit_gradient currently does not attach mle_settings 1444 assert_equal(res_g1.method, 'bfgs') 1445 1446 1447class CheckWtdDuplicationMixin(object): 1448 decimal_params = DECIMAL_4 1449 1450 @classmethod 1451 def setup_class(cls): 1452 cls.data = cpunish.load() 1453 cls.data.endog = np.asarray(cls.data.endog) 1454 cls.data.exog = np.asarray(cls.data.exog) 1455 cls.endog = cls.data.endog 1456 cls.exog = cls.data.exog 1457 np.random.seed(1234) 1458 cls.weight = np.random.randint(5, 100, len(cls.endog)) 1459 cls.endog_big = np.repeat(cls.endog, cls.weight) 1460 cls.exog_big = np.repeat(cls.exog, cls.weight, axis=0) 1461 1462 def test_params(self): 1463 assert_allclose(self.res1.params, self.res2.params, atol=1e-6, 1464 rtol=1e-6) 1465 1466 decimal_bse = DECIMAL_4 1467 1468 def test_standard_errors(self): 1469 assert_allclose(self.res1.bse, self.res2.bse, rtol=1e-5, atol=1e-6) 1470 1471 decimal_resids = DECIMAL_4 1472 1473 # TODO: This does not work... Arrays are of different shape. 1474 # Perhaps we use self.res1.model.family.resid_XXX()? 1475 """ 1476 def test_residuals(self): 1477 resids1 = np.column_stack((self.res1.resid_pearson, 1478 self.res1.resid_deviance, 1479 self.res1.resid_working, 1480 self.res1.resid_anscombe, 1481 self.res1.resid_response)) 1482 resids2 = np.column_stack((self.res1.resid_pearson, 1483 self.res2.resid_deviance, 1484 self.res2.resid_working, 1485 self.res2.resid_anscombe, 1486 self.res2.resid_response)) 1487 assert_allclose(resids1, resids2, self.decimal_resids) 1488 """ 1489 1490 def test_aic(self): 1491 # R includes the estimation of the scale as a lost dof 1492 # Does not with Gamma though 1493 assert_allclose(self.res1.aic, self.res2.aic, atol=1e-6, rtol=1e-6) 1494 1495 def test_deviance(self): 1496 assert_allclose(self.res1.deviance, self.res2.deviance, atol=1e-6, 1497 rtol=1e-6) 1498 1499 def test_scale(self): 1500 assert_allclose(self.res1.scale, self.res2.scale, atol=1e-6, rtol=1e-6) 1501 1502 def test_loglike(self): 1503 # Stata uses the below llf for these families 1504 # We differ with R for them 1505 assert_allclose(self.res1.llf, self.res2.llf, 1e-6) 1506 1507 decimal_null_deviance = DECIMAL_4 1508 1509 def test_null_deviance(self): 1510 with warnings.catch_warnings(): 1511 warnings.simplefilter("ignore", DomainWarning) 1512 1513 assert_allclose(self.res1.null_deviance, 1514 self.res2.null_deviance, 1515 atol=1e-6, 1516 rtol=1e-6) 1517 1518 decimal_bic = DECIMAL_4 1519 1520 def test_bic(self): 1521 with warnings.catch_warnings(): 1522 warnings.simplefilter("ignore") 1523 assert_allclose(self.res1.bic, self.res2.bic, atol=1e-6, rtol=1e-6) 1524 1525 decimal_fittedvalues = DECIMAL_4 1526 1527 def test_fittedvalues(self): 1528 res2_fitted = self.res2.predict(self.res1.model.exog) 1529 assert_allclose(self.res1.fittedvalues, res2_fitted, atol=1e-5, 1530 rtol=1e-5) 1531 1532 decimal_tpvalues = DECIMAL_4 1533 1534 def test_tpvalues(self): 1535 # test comparing tvalues and pvalues with normal implementation 1536 # make sure they use normal distribution (inherited in results class) 1537 assert_allclose(self.res1.tvalues, self.res2.tvalues, atol=1e-6, 1538 rtol=2e-4) 1539 assert_allclose(self.res1.pvalues, self.res2.pvalues, atol=1e-6, 1540 rtol=1e-6) 1541 assert_allclose(self.res1.conf_int(), self.res2.conf_int(), atol=1e-6, 1542 rtol=1e-6) 1543 1544 1545class TestWtdGlmPoisson(CheckWtdDuplicationMixin): 1546 1547 @classmethod 1548 def setup_class(cls): 1549 ''' 1550 Tests Poisson family with canonical log link. 1551 ''' 1552 super(TestWtdGlmPoisson, cls).setup_class() 1553 cls.endog = np.asarray(cls.endog) 1554 cls.exog = np.asarray(cls.exog) 1555 1556 cls.res1 = GLM(cls.endog, cls.exog, 1557 freq_weights=cls.weight, 1558 family=sm.families.Poisson()).fit() 1559 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1560 family=sm.families.Poisson()).fit() 1561 1562 1563class TestWtdGlmPoissonNewton(CheckWtdDuplicationMixin): 1564 @classmethod 1565 def setup_class(cls): 1566 ''' 1567 Tests Poisson family with canonical log link. 1568 ''' 1569 super(TestWtdGlmPoissonNewton, cls).setup_class() 1570 1571 start_params = np.array([1.82794424e-04, -4.76785037e-02, 1572 -9.48249717e-02, -2.92293226e-04, 1573 2.63728909e+00, -2.05934384e+01]) 1574 1575 fit_kwds = dict(method='newton') 1576 cls.res1 = GLM(cls.endog, cls.exog, 1577 freq_weights=cls.weight, 1578 family=sm.families.Poisson()).fit(**fit_kwds) 1579 fit_kwds = dict(method='newton', start_params=start_params) 1580 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1581 family=sm.families.Poisson()).fit(**fit_kwds) 1582 1583 1584class TestWtdGlmPoissonHC0(CheckWtdDuplicationMixin): 1585 @classmethod 1586 def setup_class(cls): 1587 1588 ''' 1589 Tests Poisson family with canonical log link. 1590 ''' 1591 super(TestWtdGlmPoissonHC0, cls).setup_class() 1592 1593 start_params = np.array([1.82794424e-04, -4.76785037e-02, 1594 -9.48249717e-02, -2.92293226e-04, 1595 2.63728909e+00, -2.05934384e+01]) 1596 1597 fit_kwds = dict(cov_type='HC0') 1598 cls.res1 = GLM(cls.endog, cls.exog, 1599 freq_weights=cls.weight, 1600 family=sm.families.Poisson()).fit(**fit_kwds) 1601 fit_kwds = dict(cov_type='HC0', start_params=start_params) 1602 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1603 family=sm.families.Poisson()).fit(**fit_kwds) 1604 1605 1606class TestWtdGlmPoissonClu(CheckWtdDuplicationMixin): 1607 @classmethod 1608 def setup_class(cls): 1609 1610 ''' 1611 Tests Poisson family with canonical log link. 1612 ''' 1613 super(TestWtdGlmPoissonClu, cls).setup_class() 1614 1615 start_params = np.array([1.82794424e-04, -4.76785037e-02, 1616 -9.48249717e-02, -2.92293226e-04, 1617 2.63728909e+00, -2.05934384e+01]) 1618 1619 gid = np.arange(1, len(cls.endog) + 1) // 2 1620 fit_kwds = dict(cov_type='cluster', cov_kwds={'groups': gid, 'use_correction':False}) 1621 1622 import warnings 1623 with warnings.catch_warnings(): 1624 warnings.simplefilter("ignore") 1625 cls.res1 = GLM(cls.endog, cls.exog, 1626 freq_weights=cls.weight, 1627 family=sm.families.Poisson()).fit(**fit_kwds) 1628 gidr = np.repeat(gid, cls.weight) 1629 fit_kwds = dict(cov_type='cluster', cov_kwds={'groups': gidr, 'use_correction':False}) 1630 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1631 family=sm.families.Poisson()).fit(start_params=start_params, 1632 **fit_kwds) 1633 1634 1635class TestWtdGlmBinomial(CheckWtdDuplicationMixin): 1636 @classmethod 1637 def setup_class(cls): 1638 1639 ''' 1640 Tests Binomial family with canonical logit link. 1641 ''' 1642 super(TestWtdGlmBinomial, cls).setup_class() 1643 cls.endog = cls.endog / 100 1644 cls.endog_big = cls.endog_big / 100 1645 cls.res1 = GLM(cls.endog, cls.exog, 1646 freq_weights=cls.weight, 1647 family=sm.families.Binomial()).fit() 1648 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1649 family=sm.families.Binomial()).fit() 1650 1651 1652class TestWtdGlmNegativeBinomial(CheckWtdDuplicationMixin): 1653 @classmethod 1654 def setup_class(cls): 1655 1656 ''' 1657 Tests Negative Binomial family with canonical link 1658 g(p) = log(p/(p + 1/alpha)) 1659 ''' 1660 super(TestWtdGlmNegativeBinomial, cls).setup_class() 1661 alpha = 1. 1662 1663 with warnings.catch_warnings(): 1664 warnings.simplefilter("ignore", category=DomainWarning) 1665 family_link = sm.families.NegativeBinomial( 1666 link=sm.families.links.nbinom(alpha=alpha), 1667 alpha=alpha) 1668 cls.res1 = GLM(cls.endog, cls.exog, 1669 freq_weights=cls.weight, 1670 family=family_link).fit() 1671 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1672 family=family_link).fit() 1673 1674 1675class TestWtdGlmGamma(CheckWtdDuplicationMixin): 1676 @classmethod 1677 def setup_class(cls): 1678 1679 ''' 1680 Tests Gamma family with log link. 1681 ''' 1682 super(TestWtdGlmGamma, cls).setup_class() 1683 family_link = sm.families.Gamma(sm.families.links.log()) 1684 cls.res1 = GLM(cls.endog, cls.exog, 1685 freq_weights=cls.weight, 1686 family=family_link).fit() 1687 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1688 family=family_link).fit() 1689 1690 1691class TestWtdGlmGaussian(CheckWtdDuplicationMixin): 1692 @classmethod 1693 def setup_class(cls): 1694 ''' 1695 Tests Gaussian family with log link. 1696 ''' 1697 super(TestWtdGlmGaussian, cls).setup_class() 1698 family_link = sm.families.Gaussian(sm.families.links.log()) 1699 cls.res1 = GLM(cls.endog, cls.exog, 1700 freq_weights=cls.weight, 1701 family=family_link).fit() 1702 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1703 family=family_link).fit() 1704 1705 1706class TestWtdGlmInverseGaussian(CheckWtdDuplicationMixin): 1707 @classmethod 1708 def setup_class(cls): 1709 ''' 1710 Tests InverseGaussian family with log link. 1711 ''' 1712 super(TestWtdGlmInverseGaussian, cls).setup_class() 1713 family_link = sm.families.InverseGaussian(sm.families.links.log()) 1714 cls.res1 = GLM(cls.endog, cls.exog, 1715 freq_weights=cls.weight, 1716 family=family_link).fit() 1717 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1718 family=family_link).fit() 1719 1720 1721class TestWtdGlmGammaNewton(CheckWtdDuplicationMixin): 1722 @classmethod 1723 def setup_class(cls): 1724 ''' 1725 Tests Gamma family with log link. 1726 ''' 1727 super(TestWtdGlmGammaNewton, cls).setup_class() 1728 family_link = sm.families.Gamma(sm.families.links.log()) 1729 cls.res1 = GLM(cls.endog, cls.exog, 1730 freq_weights=cls.weight, 1731 family=family_link 1732 ).fit(method='newton') 1733 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1734 family=family_link 1735 ).fit(method='newton') 1736 1737 def test_init_kwargs(self): 1738 family_link = sm.families.Gamma(sm.families.links.log()) 1739 1740 with pytest.warns(ValueWarning, match="unknown kwargs"): 1741 GLM(self.endog, self.exog, family=family_link, 1742 weights=self.weight, # incorrect keyword 1743 ) 1744 1745 1746class TestWtdGlmGammaScale_X2(CheckWtdDuplicationMixin): 1747 @classmethod 1748 def setup_class(cls): 1749 ''' 1750 Tests Gamma family with log link. 1751 ''' 1752 super(TestWtdGlmGammaScale_X2, cls).setup_class() 1753 family_link = sm.families.Gamma(sm.families.links.log()) 1754 cls.res1 = GLM(cls.endog, cls.exog, 1755 freq_weights=cls.weight, 1756 family=family_link, 1757 ).fit(scale='X2') 1758 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1759 family=family_link, 1760 ).fit(scale='X2') 1761 1762 1763class TestWtdGlmGammaScale_dev(CheckWtdDuplicationMixin): 1764 @classmethod 1765 def setup_class(cls): 1766 ''' 1767 Tests Gamma family with log link. 1768 ''' 1769 super(TestWtdGlmGammaScale_dev, cls).setup_class() 1770 family_link = sm.families.Gamma(sm.families.links.log()) 1771 cls.res1 = GLM(cls.endog, cls.exog, 1772 freq_weights=cls.weight, 1773 family=family_link, 1774 ).fit(scale='dev') 1775 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1776 family=family_link, 1777 ).fit(scale='dev') 1778 1779 def test_missing(self): 1780 endog = self.data.endog.copy() 1781 exog = self.data.exog.copy() 1782 exog[0, 0] = np.nan 1783 endog[[2, 4, 6, 8]] = np.nan 1784 freq_weights = self.weight 1785 mod_misisng = GLM(endog, exog, family=self.res1.model.family, 1786 freq_weights=freq_weights, missing='drop') 1787 assert_equal(mod_misisng.freq_weights.shape[0], 1788 mod_misisng.endog.shape[0]) 1789 assert_equal(mod_misisng.freq_weights.shape[0], 1790 mod_misisng.exog.shape[0]) 1791 keep_idx = np.array([1, 3, 5, 7, 9, 10, 11, 12, 13, 14, 15, 16]) 1792 assert_equal(mod_misisng.freq_weights, self.weight[keep_idx]) 1793 1794 1795class TestWtdTweedieLog(CheckWtdDuplicationMixin): 1796 @classmethod 1797 def setup_class(cls): 1798 ''' 1799 Tests Tweedie family with log link and var_power=1. 1800 ''' 1801 super(TestWtdTweedieLog, cls).setup_class() 1802 family_link = sm.families.Tweedie(link=sm.families.links.log(), 1803 var_power=1) 1804 cls.res1 = GLM(cls.endog, cls.exog, 1805 freq_weights=cls.weight, 1806 family=family_link).fit() 1807 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1808 family=family_link).fit() 1809 1810 1811class TestWtdTweediePower2(CheckWtdDuplicationMixin): 1812 @classmethod 1813 def setup_class(cls): 1814 ''' 1815 Tests Tweedie family with Power(1) link and var_power=2. 1816 ''' 1817 cls.data = cpunish.load_pandas() 1818 cls.endog = cls.data.endog 1819 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 1820 np.random.seed(1234) 1821 cls.weight = np.random.randint(5, 100, len(cls.endog)) 1822 cls.endog_big = np.repeat(cls.endog.values, cls.weight) 1823 cls.exog_big = np.repeat(cls.exog.values, cls.weight, axis=0) 1824 link = sm.families.links.Power() 1825 family_link = sm.families.Tweedie(link=link, var_power=2) 1826 cls.res1 = GLM(cls.endog, cls.exog, 1827 freq_weights=cls.weight, 1828 family=family_link).fit() 1829 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1830 family=family_link).fit() 1831 1832 1833class TestWtdTweediePower15(CheckWtdDuplicationMixin): 1834 @classmethod 1835 def setup_class(cls): 1836 ''' 1837 Tests Tweedie family with Power(0.5) link and var_power=1.5. 1838 ''' 1839 super(TestWtdTweediePower15, cls).setup_class() 1840 family_link = sm.families.Tweedie(link=sm.families.links.Power(0.5), 1841 var_power=1.5) 1842 cls.res1 = GLM(cls.endog, cls.exog, 1843 freq_weights=cls.weight, 1844 family=family_link).fit() 1845 cls.res2 = GLM(cls.endog_big, cls.exog_big, 1846 family=family_link).fit() 1847 1848 1849def test_wtd_patsy_missing(): 1850 import pandas as pd 1851 data = cpunish.load() 1852 data.endog = np.asarray(data.endog) 1853 data.exog = np.asarray(data.exog) 1854 data.exog[0, 0] = np.nan 1855 data.endog[[2, 4, 6, 8]] = np.nan 1856 data.pandas = pd.DataFrame(data.exog, columns=data.exog_name) 1857 data.pandas['EXECUTIONS'] = data.endog 1858 weights = np.arange(1, len(data.endog)+1) 1859 formula = """EXECUTIONS ~ INCOME + PERPOVERTY + PERBLACK + VC100k96 + 1860 SOUTH + DEGREE""" 1861 mod_misisng = GLM.from_formula(formula, data=data.pandas, 1862 freq_weights=weights) 1863 assert_equal(mod_misisng.freq_weights.shape[0], 1864 mod_misisng.endog.shape[0]) 1865 assert_equal(mod_misisng.freq_weights.shape[0], 1866 mod_misisng.exog.shape[0]) 1867 assert_equal(mod_misisng.freq_weights.shape[0], 12) 1868 keep_weights = np.array([2, 4, 6, 8, 10, 11, 12, 13, 14, 15, 16, 17]) 1869 assert_equal(mod_misisng.freq_weights, keep_weights) 1870 1871 1872class CheckTweedie(object): 1873 def test_resid(self): 1874 idx1 = len(self.res1.resid_response) - 1 1875 idx2 = len(self.res2.resid_response) - 1 1876 assert_allclose(np.concatenate((self.res1.resid_response[:17], 1877 [self.res1.resid_response[idx1]])), 1878 np.concatenate((self.res2.resid_response[:17], 1879 [self.res2.resid_response[idx2]])), 1880 rtol=1e-5, atol=1e-5) 1881 assert_allclose(np.concatenate((self.res1.resid_pearson[:17], 1882 [self.res1.resid_pearson[idx1]])), 1883 np.concatenate((self.res2.resid_pearson[:17], 1884 [self.res2.resid_pearson[idx2]])), 1885 rtol=1e-5, atol=1e-5) 1886 assert_allclose(np.concatenate((self.res1.resid_deviance[:17], 1887 [self.res1.resid_deviance[idx1]])), 1888 np.concatenate((self.res2.resid_deviance[:17], 1889 [self.res2.resid_deviance[idx2]])), 1890 rtol=1e-5, atol=1e-5) 1891 1892 assert_allclose(np.concatenate((self.res1.resid_working[:17], 1893 [self.res1.resid_working[idx1]])), 1894 np.concatenate((self.res2.resid_working[:17], 1895 [self.res2.resid_working[idx2]])), 1896 rtol=1e-5, atol=1e-5) 1897 1898 1899 def test_bse(self): 1900 assert_allclose(self.res1.bse, self.res2.bse, atol=1e-6, rtol=1e6) 1901 1902 def test_params(self): 1903 assert_allclose(self.res1.params, self.res2.params, atol=1e-5, 1904 rtol=1e-5) 1905 1906 def test_deviance(self): 1907 assert_allclose(self.res1.deviance, self.res2.deviance, atol=1e-6, 1908 rtol=1e-6) 1909 1910 def test_df(self): 1911 assert_equal(self.res1.df_model, self.res2.df_model) 1912 assert_equal(self.res1.df_resid, self.res2.df_resid) 1913 1914 def test_fittedvalues(self): 1915 idx1 = len(self.res1.fittedvalues) - 1 1916 idx2 = len(self.res2.resid_response) - 1 1917 assert_allclose(np.concatenate((self.res1.fittedvalues[:17], 1918 [self.res1.fittedvalues[idx1]])), 1919 np.concatenate((self.res2.fittedvalues[:17], 1920 [self.res2.fittedvalues[idx2]])), 1921 atol=1e-4, rtol=1e-4) 1922 1923 def test_summary(self): 1924 self.res1.summary() 1925 self.res1.summary2() 1926 1927 1928class TestTweediePower15(CheckTweedie): 1929 @classmethod 1930 def setup_class(cls): 1931 from .results.results_glm import CpunishTweediePower15 1932 cls.data = cpunish.load_pandas() 1933 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 1934 cls.endog = cls.data.endog 1935 family_link = sm.families.Tweedie(link=sm.families.links.Power(1), 1936 var_power=1.5) 1937 cls.res1 = sm.GLM(endog=cls.data.endog, 1938 exog=cls.data.exog[['INCOME', 'SOUTH']], 1939 family=family_link).fit() 1940 cls.res2 = CpunishTweediePower15() 1941 1942 1943class TestTweediePower2(CheckTweedie): 1944 @classmethod 1945 def setup_class(cls): 1946 from .results.results_glm import CpunishTweediePower2 1947 cls.data = cpunish.load_pandas() 1948 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 1949 cls.endog = cls.data.endog 1950 family_link = sm.families.Tweedie(link=sm.families.links.Power(1), 1951 var_power=2.) 1952 cls.res1 = sm.GLM(endog=cls.data.endog, 1953 exog=cls.data.exog[['INCOME', 'SOUTH']], 1954 family=family_link).fit() 1955 cls.res2 = CpunishTweediePower2() 1956 1957 1958class TestTweedieLog1(CheckTweedie): 1959 @classmethod 1960 def setup_class(cls): 1961 from .results.results_glm import CpunishTweedieLog1 1962 cls.data = cpunish.load_pandas() 1963 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 1964 cls.endog = cls.data.endog 1965 family_link = sm.families.Tweedie(link=sm.families.links.log(), 1966 var_power=1.) 1967 cls.res1 = sm.GLM(endog=cls.data.endog, 1968 exog=cls.data.exog[['INCOME', 'SOUTH']], 1969 family=family_link).fit() 1970 cls.res2 = CpunishTweedieLog1() 1971 1972 1973class TestTweedieLog15Fair(CheckTweedie): 1974 @classmethod 1975 def setup_class(cls): 1976 from statsmodels.datasets.fair import load_pandas 1977 1978 from .results.results_glm import FairTweedieLog15 1979 data = load_pandas() 1980 family_link = sm.families.Tweedie(link=sm.families.links.log(), 1981 var_power=1.5) 1982 cls.res1 = sm.GLM(endog=data.endog, 1983 exog=data.exog[['rate_marriage', 'age', 1984 'yrs_married']], 1985 family=family_link).fit() 1986 cls.res2 = FairTweedieLog15() 1987 1988 1989class CheckTweedieSpecial(object): 1990 def test_mu(self): 1991 assert_allclose(self.res1.mu, self.res2.mu, rtol=1e-5, atol=1e-5) 1992 1993 def test_resid(self): 1994 assert_allclose(self.res1.resid_response, self.res2.resid_response, 1995 rtol=1e-5, atol=1e-5) 1996 assert_allclose(self.res1.resid_pearson, self.res2.resid_pearson, 1997 rtol=1e-5, atol=1e-5) 1998 assert_allclose(self.res1.resid_deviance, self.res2.resid_deviance, 1999 rtol=1e-5, atol=1e-5) 2000 assert_allclose(self.res1.resid_working, self.res2.resid_working, 2001 rtol=1e-5, atol=1e-5) 2002 assert_allclose(self.res1.resid_anscombe_unscaled, 2003 self.res2.resid_anscombe_unscaled, 2004 rtol=1e-5, atol=1e-5) 2005 2006 2007class TestTweedieSpecialLog0(CheckTweedieSpecial): 2008 @classmethod 2009 def setup_class(cls): 2010 cls.data = cpunish.load_pandas() 2011 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 2012 cls.endog = cls.data.endog 2013 family1 = sm.families.Gaussian(link=sm.families.links.log()) 2014 cls.res1 = sm.GLM(endog=cls.data.endog, 2015 exog=cls.data.exog[['INCOME', 'SOUTH']], 2016 family=family1).fit() 2017 family2 = sm.families.Tweedie(link=sm.families.links.log(), 2018 var_power=0) 2019 cls.res2 = sm.GLM(endog=cls.data.endog, 2020 exog=cls.data.exog[['INCOME', 'SOUTH']], 2021 family=family2).fit() 2022 2023 2024class TestTweedieSpecialLog1(CheckTweedieSpecial): 2025 @classmethod 2026 def setup_class(cls): 2027 cls.data = cpunish.load_pandas() 2028 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 2029 cls.endog = cls.data.endog 2030 family1 = sm.families.Poisson(link=sm.families.links.log()) 2031 cls.res1 = sm.GLM(endog=cls.data.endog, 2032 exog=cls.data.exog[['INCOME', 'SOUTH']], 2033 family=family1).fit() 2034 family2 = sm.families.Tweedie(link=sm.families.links.log(), 2035 var_power=1) 2036 cls.res2 = sm.GLM(endog=cls.data.endog, 2037 exog=cls.data.exog[['INCOME', 'SOUTH']], 2038 family=family2).fit() 2039 2040 2041class TestTweedieSpecialLog2(CheckTweedieSpecial): 2042 @classmethod 2043 def setup_class(cls): 2044 cls.data = cpunish.load_pandas() 2045 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 2046 cls.endog = cls.data.endog 2047 family1 = sm.families.Gamma(link=sm.families.links.log()) 2048 cls.res1 = sm.GLM(endog=cls.data.endog, 2049 exog=cls.data.exog[['INCOME', 'SOUTH']], 2050 family=family1).fit() 2051 family2 = sm.families.Tweedie(link=sm.families.links.log(), 2052 var_power=2) 2053 cls.res2 = sm.GLM(endog=cls.data.endog, 2054 exog=cls.data.exog[['INCOME', 'SOUTH']], 2055 family=family2).fit() 2056 2057 2058class TestTweedieSpecialLog3(CheckTweedieSpecial): 2059 @classmethod 2060 def setup_class(cls): 2061 cls.data = cpunish.load_pandas() 2062 cls.exog = cls.data.exog[['INCOME', 'SOUTH']] 2063 cls.endog = cls.data.endog 2064 family1 = sm.families.InverseGaussian(link=sm.families.links.log()) 2065 cls.res1 = sm.GLM(endog=cls.data.endog, 2066 exog=cls.data.exog[['INCOME', 'SOUTH']], 2067 family=family1).fit() 2068 family2 = sm.families.Tweedie(link=sm.families.links.log(), 2069 var_power=3) 2070 cls.res2 = sm.GLM(endog=cls.data.endog, 2071 exog=cls.data.exog[['INCOME', 'SOUTH']], 2072 family=family2).fit() 2073 2074def gen_tweedie(p): 2075 2076 np.random.seed(3242) 2077 n = 500 2078 x = np.random.normal(size=(n, 4)) 2079 lpr = np.dot(x, np.r_[1, -1, 0, 0.5]) 2080 mu = np.exp(lpr) 2081 lam = 10 * mu**(2 - p) / (2 - p) 2082 alp = (2 - p) / (p - 1) 2083 bet = 10 * mu**(1 - p) / (p - 1) 2084 2085 # Generate Tweedie values using commpound Poisson distribution 2086 y = np.empty(n) 2087 N = np.random.poisson(lam) 2088 for i in range(n): 2089 y[i] = np.random.gamma(alp, 1 / bet[i], N[i]).sum() 2090 2091 return y, x 2092 2093@pytest.mark.filterwarnings("ignore:GLM ridge optimization") 2094def test_tweedie_EQL(): 2095 # All tests below are regression tests, but the results 2096 # are very close to the population values. 2097 2098 p = 1.5 2099 y, x = gen_tweedie(p) 2100 2101 # Un-regularized fit using gradients 2102 fam = sm.families.Tweedie(var_power=p, eql=True) 2103 model1 = sm.GLM(y, x, family=fam) 2104 result1 = model1.fit(method="newton") 2105 assert_allclose(result1.params, 2106 np.array([1.00350497, -0.99656954, 0.00802702, 0.50713209]), 2107 rtol=1e-5, atol=1e-5) 2108 2109 # Un-regularized fit using IRLS 2110 model1x = sm.GLM(y, x, family=fam) 2111 result1x = model1x.fit(method="irls") 2112 assert_allclose(result1.params, result1x.params) 2113 assert_allclose(result1.bse, result1x.bse, rtol=1e-2) 2114 2115 # Lasso fit using coordinate-wise descent 2116 # TODO: The search gets trapped in an infinite oscillation, so use 2117 # a slack convergence tolerance. 2118 model2 = sm.GLM(y, x, family=fam) 2119 result2 = model2.fit_regularized(L1_wt=1, alpha=0.07, maxiter=200, 2120 cnvrg_tol=0.01) 2121 2122 rtol, atol = 1e-2, 1e-4 2123 assert_allclose(result2.params, 2124 np.array([0.976831, -0.952854, 0., 0.470171]), 2125 rtol=rtol, atol=atol) 2126 2127 # Series of ridge fits using gradients 2128 ev = (np.array([1.001778, -0.99388, 0.00797, 0.506183]), 2129 np.array([0.98586638, -0.96953481, 0.00749983, 0.4975267]), 2130 np.array([0.206429, -0.164547, 0.000235, 0.102489])) 2131 for j, alpha in enumerate([0.05, 0.5, 0.7]): 2132 model3 = sm.GLM(y, x, family=fam) 2133 result3 = model3.fit_regularized(L1_wt=0, alpha=alpha) 2134 assert_allclose(result3.params, ev[j], rtol=rtol, atol=atol) 2135 result4 = model3.fit_regularized(L1_wt=0, alpha=alpha * np.ones(x.shape[1])) 2136 assert_allclose(result4.params, result3.params, rtol=rtol, atol=atol) 2137 alpha = alpha * np.ones(x.shape[1]) 2138 alpha[0] = 0 2139 result5 = model3.fit_regularized(L1_wt=0, alpha=alpha) 2140 assert not np.allclose(result5.params, result4.params) 2141 2142def test_tweedie_elastic_net(): 2143 # Check that the coefficients vanish one-by-one 2144 # when using the elastic net. 2145 2146 p = 1.5 # Tweedie variance exponent 2147 y, x = gen_tweedie(p) 2148 2149 # Un-regularized fit using gradients 2150 fam = sm.families.Tweedie(var_power=p, eql=True) 2151 model1 = sm.GLM(y, x, family=fam) 2152 2153 nnz = [] 2154 for alpha in np.linspace(0, 10, 20): 2155 result1 = model1.fit_regularized(L1_wt=0.5, alpha=alpha) 2156 nnz.append((np.abs(result1.params) > 0).sum()) 2157 nnz = np.unique(nnz) 2158 assert len(nnz) == 5 2159 2160 2161def test_tweedie_EQL_poisson_limit(): 2162 # Test the limiting Poisson case of the Nelder/Pregibon/Tweedie 2163 # EQL. 2164 2165 np.random.seed(3242) 2166 n = 500 2167 2168 x = np.random.normal(size=(n, 3)) 2169 x[:, 0] = 1 2170 lpr = 4 + x[:, 1:].sum(1) 2171 mn = np.exp(lpr) 2172 y = np.random.poisson(mn) 2173 2174 for scale in 1.0, 'x2', 'dev': 2175 2176 # Un-regularized fit using gradients not IRLS 2177 fam = sm.families.Tweedie(var_power=1, eql=True) 2178 model1 = sm.GLM(y, x, family=fam) 2179 result1 = model1.fit(method="newton", scale=scale) 2180 2181 # Poisson GLM 2182 model2 = sm.GLM(y, x, family=sm.families.Poisson()) 2183 result2 = model2.fit(method="newton", scale=scale) 2184 2185 assert_allclose(result1.params, result2.params, atol=1e-6, rtol=1e-6) 2186 assert_allclose(result1.bse, result2.bse, 1e-6, 1e-6) 2187 2188 2189def test_tweedie_EQL_upper_limit(): 2190 # Test the limiting case of the Nelder/Pregibon/Tweedie 2191 # EQL with var = mean^2. These are tests against population 2192 # values so accuracy is not high. 2193 2194 np.random.seed(3242) 2195 n = 500 2196 2197 x = np.random.normal(size=(n, 3)) 2198 x[:, 0] = 1 2199 lpr = 4 + x[:, 1:].sum(1) 2200 mn = np.exp(lpr) 2201 y = np.random.poisson(mn) 2202 2203 for scale in 'x2', 'dev', 1.0: 2204 2205 # Un-regularized fit using gradients not IRLS 2206 fam = sm.families.Tweedie(var_power=2, eql=True) 2207 model1 = sm.GLM(y, x, family=fam) 2208 result1 = model1.fit(method="newton", scale=scale) 2209 assert_allclose(result1.params, np.r_[4, 1, 1], atol=1e-3, rtol=1e-1) 2210 2211 2212def testTweediePowerEstimate(): 2213 # Test the Pearson estimate of the Tweedie variance and scale parameters. 2214 # 2215 # Ideally, this would match the following R code, but I cannot make it work... 2216 # 2217 # setwd('c:/workspace') 2218 # data <- read.csv('cpunish.csv', sep=",") 2219 # 2220 # library(tweedie) 2221 # 2222 # y <- c(1.00113835e+05, 6.89668315e+03, 6.15726842e+03, 2223 # 1.41718806e+03, 5.11776456e+02, 2.55369154e+02, 2224 # 1.07147443e+01, 3.56874698e+00, 4.06797842e-02, 2225 # 7.06996731e-05, 2.10165106e-07, 4.34276938e-08, 2226 # 1.56354040e-09, 0.00000000e+00, 0.00000000e+00, 2227 # 0.00000000e+00, 0.00000000e+00) 2228 # 2229 # data$NewY <- y 2230 # 2231 # out <- tweedie.profile( NewY ~ INCOME + SOUTH - 1, 2232 # p.vec=c(1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 2233 # 1.9), link.power=0, 2234 # data=data,do.plot = TRUE) 2235 data = cpunish.load_pandas() 2236 y = [1.00113835e+05, 6.89668315e+03, 6.15726842e+03, 2237 1.41718806e+03, 5.11776456e+02, 2.55369154e+02, 2238 1.07147443e+01, 3.56874698e+00, 4.06797842e-02, 2239 7.06996731e-05, 2.10165106e-07, 4.34276938e-08, 2240 1.56354040e-09, 0.00000000e+00, 0.00000000e+00, 2241 0.00000000e+00, 0.00000000e+00] 2242 model1 = sm.GLM(y, data.exog[['INCOME', 'SOUTH']], 2243 family=sm.families.Tweedie(link=sm.families.links.log(), 2244 var_power=1.5)) 2245 res1 = model1.fit() 2246 model2 = sm.GLM((y - res1.mu) ** 2, 2247 np.column_stack((np.ones(len(res1.mu)), np.log(res1.mu))), 2248 family=sm.families.Gamma(sm.families.links.log())) 2249 res2 = model2.fit() 2250 # Sample may be too small for this... 2251 # assert_allclose(res1.scale, np.exp(res2.params[0]), rtol=0.25) 2252 p = model1.estimate_tweedie_power(res1.mu) 2253 assert_allclose(p, res2.params[1], rtol=0.25) 2254 2255def test_glm_lasso_6431(): 2256 2257 # Based on issue #6431 2258 # Fails with newton-cg as optimizer 2259 np.random.seed(123) 2260 2261 from statsmodels.regression.linear_model import OLS 2262 2263 n = 50 2264 x = np.ones((n, 2)) 2265 x[:, 1] = np.arange(0, n) 2266 y = 1000 + x[:, 1] + np.random.normal(0, 1, n) 2267 2268 params = np.r_[999.82244338, 1.0077889] 2269 2270 for method in "bfgs", None: 2271 for fun in [OLS, GLM]: 2272 2273 # Changing L1_wtValue from 0 to 1e-9 changes 2274 # the algorithm from scipy gradient optimization 2275 # to statsmodels coordinate descent 2276 for L1_wtValue in [0, 1e-9]: 2277 model = fun(y, x) 2278 if fun == OLS: 2279 fit = model.fit_regularized(alpha=0, L1_wt=L1_wtValue) 2280 else: 2281 fit = model._fit_ridge(alpha=0, start_params=None, method=method) 2282 assert_allclose(params, fit.params, atol=1e-6, rtol=1e-6) 2283 2284class TestRegularized(object): 2285 2286 def test_regularized(self): 2287 2288 import os 2289 2290 from .results import glmnet_r_results 2291 2292 for dtype in "binomial", "poisson": 2293 2294 cur_dir = os.path.dirname(os.path.abspath(__file__)) 2295 data = np.loadtxt(os.path.join(cur_dir, "results", "enet_%s.csv" % dtype), 2296 delimiter=",") 2297 2298 endog = data[:, 0] 2299 exog = data[:, 1:] 2300 2301 fam = {"binomial" : sm.families.Binomial, 2302 "poisson" : sm.families.Poisson}[dtype] 2303 2304 for j in range(9): 2305 2306 vn = "rslt_%s_%d" % (dtype, j) 2307 r_result = getattr(glmnet_r_results, vn) 2308 L1_wt = r_result[0] 2309 alpha = r_result[1] 2310 params = r_result[2:] 2311 2312 model = GLM(endog, exog, family=fam()) 2313 sm_result = model.fit_regularized(L1_wt=L1_wt, alpha=alpha) 2314 2315 # Agreement is OK, see below for further check 2316 assert_allclose(params, sm_result.params, atol=1e-2, rtol=0.3) 2317 2318 # The penalized log-likelihood that we are maximizing. 2319 def plf(params): 2320 llf = model.loglike(params) / len(endog) 2321 llf = llf - alpha * ((1 - L1_wt)*np.sum(params**2) / 2 + L1_wt*np.sum(np.abs(params))) 2322 return llf 2323 2324 # Confirm that we are doing better than glmnet. 2325 llf_r = plf(params) 2326 llf_sm = plf(sm_result.params) 2327 assert_equal(np.sign(llf_sm - llf_r), 1) 2328 2329 2330class TestConvergence(object): 2331 @classmethod 2332 def setup_class(cls): 2333 ''' 2334 Test Binomial family with canonical logit link using star98 dataset. 2335 ''' 2336 from statsmodels.datasets.star98 import load 2337 data = load() 2338 data.exog = add_constant(data.exog, prepend=False) 2339 cls.model = GLM(data.endog, data.exog, 2340 family=sm.families.Binomial()) 2341 2342 def _when_converged(self, atol=1e-8, rtol=0, tol_criterion='deviance'): 2343 for i, dev in enumerate(self.res.fit_history[tol_criterion]): 2344 orig = self.res.fit_history[tol_criterion][i] 2345 new = self.res.fit_history[tol_criterion][i + 1] 2346 if np.allclose(orig, new, atol=atol, rtol=rtol): 2347 return i 2348 raise ValueError('CONVERGENCE CHECK: It seems this doens\'t converge!') 2349 2350 def test_convergence_atol_only(self): 2351 atol = 1e-8 2352 rtol = 0 2353 self.res = self.model.fit(atol=atol, rtol=rtol) 2354 expected_iterations = self._when_converged(atol=atol, rtol=rtol) 2355 actual_iterations = self.res.fit_history['iteration'] 2356 # Note the first value is the list is np.inf. The second value 2357 # is the initial guess based off of start_params or the 2358 # estimate thereof. The third value (index = 2) is the actual "first 2359 # iteration" 2360 assert_equal(expected_iterations, actual_iterations) 2361 assert_equal(len(self.res.fit_history['deviance']) - 2, 2362 actual_iterations) 2363 2364 def test_convergence_rtol_only(self): 2365 atol = 0 2366 rtol = 1e-8 2367 self.res = self.model.fit(atol=atol, rtol=rtol) 2368 expected_iterations = self._when_converged(atol=atol, rtol=rtol) 2369 actual_iterations = self.res.fit_history['iteration'] 2370 # Note the first value is the list is np.inf. The second value 2371 # is the initial guess based off of start_params or the 2372 # estimate thereof. The third value (index = 2) is the actual "first 2373 # iteration" 2374 assert_equal(expected_iterations, actual_iterations) 2375 assert_equal(len(self.res.fit_history['deviance']) - 2, 2376 actual_iterations) 2377 2378 def test_convergence_atol_rtol(self): 2379 atol = 1e-8 2380 rtol = 1e-8 2381 self.res = self.model.fit(atol=atol, rtol=rtol) 2382 expected_iterations = self._when_converged(atol=atol, rtol=rtol) 2383 actual_iterations = self.res.fit_history['iteration'] 2384 # Note the first value is the list is np.inf. The second value 2385 # is the initial guess based off of start_params or the 2386 # estimate thereof. The third value (index = 2) is the actual "first 2387 # iteration" 2388 assert_equal(expected_iterations, actual_iterations) 2389 assert_equal(len(self.res.fit_history['deviance']) - 2, 2390 actual_iterations) 2391 2392 def test_convergence_atol_only_params(self): 2393 atol = 1e-8 2394 rtol = 0 2395 self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params') 2396 expected_iterations = self._when_converged(atol=atol, rtol=rtol, 2397 tol_criterion='params') 2398 actual_iterations = self.res.fit_history['iteration'] 2399 # Note the first value is the list is np.inf. The second value 2400 # is the initial guess based off of start_params or the 2401 # estimate thereof. The third value (index = 2) is the actual "first 2402 # iteration" 2403 assert_equal(expected_iterations, actual_iterations) 2404 assert_equal(len(self.res.fit_history['deviance']) - 2, 2405 actual_iterations) 2406 2407 def test_convergence_rtol_only_params(self): 2408 atol = 0 2409 rtol = 1e-8 2410 self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params') 2411 expected_iterations = self._when_converged(atol=atol, rtol=rtol, 2412 tol_criterion='params') 2413 actual_iterations = self.res.fit_history['iteration'] 2414 # Note the first value is the list is np.inf. The second value 2415 # is the initial guess based off of start_params or the 2416 # estimate thereof. The third value (index = 2) is the actual "first 2417 # iteration" 2418 assert_equal(expected_iterations, actual_iterations) 2419 assert_equal(len(self.res.fit_history['deviance']) - 2, 2420 actual_iterations) 2421 2422 def test_convergence_atol_rtol_params(self): 2423 atol = 1e-8 2424 rtol = 1e-8 2425 self.res = self.model.fit(atol=atol, rtol=rtol, tol_criterion='params') 2426 expected_iterations = self._when_converged(atol=atol, rtol=rtol, 2427 tol_criterion='params') 2428 actual_iterations = self.res.fit_history['iteration'] 2429 # Note the first value is the list is np.inf. The second value 2430 # is the initial guess based off of start_params or the 2431 # estimate thereof. The third value (index = 2) is the actual "first 2432 # iteration" 2433 assert_equal(expected_iterations, actual_iterations) 2434 assert_equal(len(self.res.fit_history['deviance']) - 2, 2435 actual_iterations) 2436 2437 2438def test_poisson_deviance(): 2439 # see #3355 missing term in deviance if resid_response.sum() != 0 2440 np.random.seed(123987) 2441 nobs, k_vars = 50, 3-1 2442 x = sm.add_constant(np.random.randn(nobs, k_vars)) 2443 2444 mu_true = np.exp(x.sum(1)) 2445 y = np.random.poisson(mu_true, size=nobs) 2446 2447 mod = sm.GLM(y, x[:, :], family=sm.genmod.families.Poisson()) 2448 res = mod.fit() 2449 2450 d_i = res.resid_deviance 2451 d = res.deviance 2452 lr = (mod.family.loglike(y, y+1e-20) - 2453 mod.family.loglike(y, res.fittedvalues)) * 2 2454 2455 assert_allclose(d, (d_i**2).sum(), rtol=1e-12) 2456 assert_allclose(d, lr, rtol=1e-12) 2457 2458 # case without constant, resid_response.sum() != 0 2459 mod_nc = sm.GLM(y, x[:, 1:], family=sm.genmod.families.Poisson()) 2460 res_nc = mod_nc.fit() 2461 2462 d_i = res_nc.resid_deviance 2463 d = res_nc.deviance 2464 lr = (mod.family.loglike(y, y+1e-20) - 2465 mod.family.loglike(y, res_nc.fittedvalues)) * 2 2466 2467 assert_allclose(d, (d_i**2).sum(), rtol=1e-12) 2468 assert_allclose(d, lr, rtol=1e-12) 2469 2470 2471def test_non_invertible_hessian_fails_summary(): 2472 # Test when the hessian fails the summary is still available. 2473 data = cpunish.load_pandas() 2474 2475 data.endog[:] = 1 2476 with warnings.catch_warnings(): 2477 # we filter DomainWarning, the convergence problems 2478 # and warnings in summary 2479 warnings.simplefilter("ignore") 2480 mod = sm.GLM(data.endog, data.exog, family=sm.families.Gamma()) 2481 res = mod.fit(maxiter=1, method='bfgs', max_start_irls=0) 2482 res.summary() 2483 2484 2485def test_int_scale(): 2486 # GH-6627, make sure it works with int scale 2487 data = longley.load() 2488 mod = GLM(data.endog, data.exog, family=sm.families.Gaussian()) 2489 res = mod.fit(scale=1) 2490 assert isinstance(res.params, pd.Series) 2491 assert res.scale.dtype == np.float64 2492 2493 2494@pytest.mark.parametrize("dtype", [np.int8, np.int16, np.int32, np.int64]) 2495def test_int_exog(dtype): 2496 # GH-6627, make use of floats internally 2497 count1, n1, count2, n2 = 60, 51477.5, 30, 54308.7 2498 y = [count1, count2] 2499 x = np.asarray([[1, 1], [1, 0]]).astype(dtype) 2500 exposure = np.asarray([n1, n2]) 2501 mod = GLM(y, x, exposure=exposure, family=sm.families.Poisson()) 2502 res = mod.fit(method='bfgs', max_start_irls=0) 2503 assert isinstance(res.params, np.ndarray) 2504 2505 2506def test_glm_bic(iris): 2507 X = np.c_[np.ones(100), iris[50:, :4]] 2508 y = np.array(iris)[50:, 4].astype(np.int32) 2509 y -= 1 2510 SET_USE_BIC_LLF(True) 2511 model = GLM(y, X, family=sm.families.Binomial()).fit() 2512 # 34.9244 is what glm() of R yields 2513 assert_almost_equal(model.bic, 34.9244, decimal=3) 2514 assert_almost_equal(model.bic_llf, 34.9244, decimal=3) 2515 SET_USE_BIC_LLF(False) 2516 assert_almost_equal(model.bic, model.bic_deviance, decimal=3) 2517 SET_USE_BIC_LLF(None) 2518 2519 2520def test_glm_bic_warning(iris): 2521 X = np.c_[np.ones(100), iris[50:, :4]] 2522 y = np.array(iris)[50:, 4].astype(np.int32) 2523 y -= 1 2524 model = GLM(y, X, family=sm.families.Binomial()).fit() 2525 with pytest.warns(FutureWarning, match="The bic"): 2526 assert isinstance(model.bic, float) 2527 2528 2529def test_output_exposure_null(reset_randomstate): 2530 # GH 6953 2531 2532 x0 = [np.sin(i / 20) + 2 for i in range(1000)] 2533 rs = np.random.RandomState(0) 2534 # Variable exposures for each observation 2535 exposure = rs.randint(100, 200, size=1000) 2536 y = [np.sum(rs.poisson(x, size=e)) for x, e in zip(x0, exposure)] 2537 x = add_constant(x0) 2538 2539 model = GLM( 2540 endog=y, exog=x, exposure=exposure, family=sm.families.Poisson() 2541 ).fit() 2542 null_model = GLM( 2543 endog=y, exog=x[:, 0], exposure=exposure, family=sm.families.Poisson() 2544 ).fit() 2545 null_model_without_exposure = GLM( 2546 endog=y, exog=x[:, 0], family=sm.families.Poisson() 2547 ).fit() 2548 assert_allclose(model.llnull, null_model.llf) 2549 # Check that they are different 2550 assert np.abs(null_model_without_exposure.llf - model.llnull) > 1 2551 2552 2553def test_qaic(): 2554 2555 # Example from documentation of R package MuMIn 2556 import patsy 2557 ldose = np.concatenate((np.arange(6), np.arange(6))) 2558 sex = ["M"]*6 + ["F"]*6 2559 numdead = [10, 4, 9, 12, 18, 20, 0, 2, 6, 10, 12, 16] 2560 df = pd.DataFrame({"ldose": ldose, "sex": sex, "numdead": numdead}) 2561 df["numalive"] = 20 - df["numdead"] 2562 df["SF"] = df["numdead"] 2563 2564 y = df[["numalive", "numdead"]].values 2565 x = patsy.dmatrix("sex*ldose", data=df, return_type='dataframe') 2566 m = GLM(y, x, family=sm.families.Binomial()) 2567 r = m.fit() 2568 scale = 2.412699 2569 qaic = r.info_criteria(crit="qaic", scale=scale) 2570 2571 # R gives 31.13266 because it uses a df that is 1 greater, 2572 # presumably because they count the scale parameter in df. 2573 # This won't matter when comparing models by differencing 2574 # QAICs. 2575 # Binomial doesn't have a scale parameter, so adding +1 is not correct. 2576 assert_allclose(qaic, 29.13266, rtol=1e-5, atol=1e-5) 2577 qaic1 = r.info_criteria(crit="qaic", scale=scale, dk_params=1) 2578 assert_allclose(qaic1, 31.13266, rtol=1e-5, atol=1e-5) 2579 2580 2581def test_tweedie_score(): 2582 2583 np.random.seed(3242) 2584 n = 500 2585 x = np.random.normal(size=(n, 4)) 2586 lpr = np.dot(x, np.r_[1, -1, 0, 0.5]) 2587 mu = np.exp(lpr) 2588 2589 p0 = 1.5 2590 lam = 10 * mu**(2 - p0) / (2 - p0) 2591 alp = (2 - p0) / (p0 - 1) 2592 bet = 10 * mu**(1 - p0) / (p0 - 1) 2593 y = np.empty(n) 2594 N = np.random.poisson(lam) 2595 for i in range(n): 2596 y[i] = np.random.gamma(alp, 1 / bet[i], N[i]).sum() 2597 2598 for p in [1, 1.5, 2]: 2599 2600 fam = sm.families.Tweedie(var_power=p, eql=True) 2601 model = GLM(y, x, family=fam) 2602 result = model.fit() 2603 2604 pa = result.params + 0.2*np.random.normal(size=result.params.size) 2605 2606 ngrad = approx_fprime_cs(pa, lambda x: model.loglike(x, scale=1)) 2607 agrad = model.score(pa, scale=1) 2608 assert_allclose(ngrad, agrad, atol=1e-8, rtol=1e-8) 2609 2610 nhess = approx_hess_cs(pa, lambda x: model.loglike(x, scale=1)) 2611 ahess = model.hessian(pa, scale=1) 2612 assert_allclose(nhess, ahess, atol=5e-8, rtol=5e-8) 2613