1""" 2Test functions for GEE 3 4External comparisons are to R and Stata. The statsmodels GEE 5implementation should generally agree with the R GEE implementation 6for the independence and exchangeable correlation structures. For 7other correlation structures, the details of the correlation 8estimation differ among implementations and the results will not agree 9exactly. 10""" 11 12from statsmodels.compat import lrange 13import os 14import numpy as np 15import pytest 16from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose, 17 assert_array_less, assert_raises, assert_warns, 18 assert_) 19import statsmodels.genmod.generalized_estimating_equations as gee 20import statsmodels.tools as tools 21import statsmodels.regression.linear_model as lm 22from statsmodels.genmod import families 23from statsmodels.genmod import cov_struct 24import statsmodels.discrete.discrete_model as discrete 25 26import pandas as pd 27from scipy.stats.distributions import norm 28import warnings 29 30try: 31 import matplotlib.pyplot as plt 32except ImportError: 33 pass 34 35pdf_output = False 36 37if pdf_output: 38 from matplotlib.backends.backend_pdf import PdfPages 39 pdf = PdfPages("test_glm.pdf") 40else: 41 pdf = None 42 43 44def close_or_save(pdf, fig): 45 if pdf_output: 46 pdf.savefig(fig) 47 48 49def load_data(fname, icept=True): 50 """ 51 Load a data set from the results directory. The data set should 52 be a CSV file with the following format: 53 54 Column 0: Group indicator 55 Column 1: endog variable 56 Columns 2-end: exog variables 57 58 If `icept` is True, an intercept is prepended to the exog 59 variables. 60 """ 61 62 cur_dir = os.path.dirname(os.path.abspath(__file__)) 63 Z = np.genfromtxt(os.path.join(cur_dir, 'results', fname), 64 delimiter=",") 65 66 group = Z[:, 0] 67 endog = Z[:, 1] 68 exog = Z[:, 2:] 69 70 if icept: 71 exog = np.concatenate((np.ones((exog.shape[0], 1)), exog), 72 axis=1) 73 74 return endog, exog, group 75 76 77def check_wrapper(results): 78 # check wrapper 79 assert_(isinstance(results.params, pd.Series)) 80 assert_(isinstance(results.fittedvalues, pd.Series)) 81 assert_(isinstance(results.resid, pd.Series)) 82 assert_(isinstance(results.centered_resid, pd.Series)) 83 84 assert_(isinstance(results._results.params, np.ndarray)) 85 assert_(isinstance(results._results.fittedvalues, np.ndarray)) 86 assert_(isinstance(results._results.resid, np.ndarray)) 87 assert_(isinstance(results._results.centered_resid, np.ndarray)) 88 89 90class TestGEE(object): 91 92 def test_margins_gaussian(self): 93 # Check marginal effects for a Gaussian GEE fit. Marginal 94 # effects and ordinary effects should be equal. 95 96 n = 40 97 np.random.seed(34234) 98 exog = np.random.normal(size=(n, 3)) 99 exog[:, 0] = 1 100 101 groups = np.kron(np.arange(n / 4), np.r_[1, 1, 1, 1]) 102 endog = exog[:, 1] + np.random.normal(size=n) 103 104 model = gee.GEE(endog, exog, groups) 105 result = model.fit( 106 start_params=[-4.88085602e-04, 1.18501903, 4.78820100e-02]) 107 108 marg = result.get_margeff() 109 110 assert_allclose(marg.margeff, result.params[1:]) 111 assert_allclose(marg.margeff_se, result.bse[1:]) 112 113 # smoke test 114 marg.summary() 115 116 def test_margins_logistic(self): 117 # Check marginal effects for a binomial GEE fit. Comparison 118 # comes from Stata. 119 120 np.random.seed(34234) 121 endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1] 122 exog = np.ones((8, 2)) 123 exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2] 124 125 groups = np.arange(8) 126 127 model = gee.GEE(endog, exog, groups, family=families.Binomial()) 128 result = model.fit( 129 cov_type='naive', start_params=[-3.29583687, 2.19722458]) 130 131 marg = result.get_margeff() 132 133 assert_allclose(marg.margeff, np.r_[0.4119796]) 134 assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6) 135 136 def test_margins_multinomial(self): 137 # Check marginal effects for a 2-class multinomial GEE fit, 138 # which should be equivalent to logistic regression. Comparison 139 # comes from Stata. 140 141 np.random.seed(34234) 142 endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1] 143 exog = np.ones((8, 2)) 144 exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2] 145 146 groups = np.arange(8) 147 148 model = gee.NominalGEE(endog, exog, groups) 149 result = model.fit(cov_type='naive', start_params=[ 150 3.295837, -2.197225]) 151 152 marg = result.get_margeff() 153 154 assert_allclose(marg.margeff, np.r_[-0.41197961], rtol=1e-5) 155 assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6) 156 157 @pytest.mark.smoke 158 @pytest.mark.matplotlib 159 def test_nominal_plot(self, close_figures): 160 np.random.seed(34234) 161 endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1] 162 exog = np.ones((8, 2)) 163 exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2] 164 165 groups = np.arange(8) 166 167 model = gee.NominalGEE(endog, exog, groups) 168 result = model.fit(cov_type='naive', 169 start_params=[3.295837, -2.197225]) 170 171 fig = result.plot_distribution() 172 assert_equal(isinstance(fig, plt.Figure), True) 173 174 def test_margins_poisson(self): 175 # Check marginal effects for a Poisson GEE fit. 176 177 np.random.seed(34234) 178 endog = np.r_[10, 15, 12, 13, 20, 18, 26, 29] 179 exog = np.ones((8, 2)) 180 exog[:, 1] = np.r_[0, 0, 0, 0, 1, 1, 1, 1] 181 182 groups = np.arange(8) 183 184 model = gee.GEE(endog, exog, groups, family=families.Poisson()) 185 result = model.fit(cov_type='naive', start_params=[ 186 2.52572864, 0.62057649]) 187 188 marg = result.get_margeff() 189 190 assert_allclose(marg.margeff, np.r_[11.0928], rtol=1e-6) 191 assert_allclose(marg.margeff_se, np.r_[3.269015], rtol=1e-6) 192 193 def test_multinomial(self): 194 """ 195 Check the 2-class multinomial (nominal) GEE fit against 196 logistic regression. 197 """ 198 199 np.random.seed(34234) 200 endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1] 201 exog = np.ones((8, 2)) 202 exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2] 203 204 groups = np.arange(8) 205 206 model = gee.NominalGEE(endog, exog, groups) 207 results = model.fit(cov_type='naive', start_params=[ 208 3.295837, -2.197225]) 209 210 logit_model = gee.GEE(endog, exog, groups, 211 family=families.Binomial()) 212 logit_results = logit_model.fit(cov_type='naive') 213 214 assert_allclose(results.params, -logit_results.params, rtol=1e-5) 215 assert_allclose(results.bse, logit_results.bse, rtol=1e-5) 216 217 def test_weighted(self): 218 219 # Simple check where the answer can be computed by hand. 220 exog = np.ones(20) 221 weights = np.ones(20) 222 weights[0:10] = 2 223 endog = np.zeros(20) 224 endog[0:10] += 1 225 groups = np.kron(np.arange(10), np.r_[1, 1]) 226 model = gee.GEE(endog, exog, groups, weights=weights) 227 result = model.fit() 228 assert_allclose(result.params, np.r_[2 / 3.]) 229 230 # Comparison against stata using groups with different sizes. 231 weights = np.ones(20) 232 weights[10:] = 2 233 endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6, 234 7, 8, 7, 8] 235 exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 236 3, 3, 3, 3] 237 groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6, 238 8, 8, 9, 9, 10, 10] 239 exog = np.column_stack((np.ones(20), exog1)) 240 241 # Comparison using independence model 242 model = gee.GEE(endog, exog, groups, weights=weights, 243 cov_struct=cov_struct.Independence()) 244 g = np.mean([2, 4, 2, 2, 4, 2, 2, 2]) 245 fac = 20 / float(20 - g) 246 result = model.fit(ddof_scale=0, scaling_factor=fac) 247 248 assert_allclose(result.params, np.r_[1.247573, 1.436893], atol=1e-6) 249 assert_allclose(result.scale, 1.808576) 250 251 # Stata multiples robust SE by sqrt(N / (N - g)), where N is 252 # the total sample size and g is the average group size. 253 assert_allclose(result.bse, np.r_[0.895366, 0.3425498], atol=1e-5) 254 255 # Comparison using exchangeable model 256 # Smoke test for now 257 model = gee.GEE(endog, exog, groups, weights=weights, 258 cov_struct=cov_struct.Exchangeable()) 259 model.fit(ddof_scale=0) 260 261 # This is in the release announcement for version 0.6. 262 def test_poisson_epil(self): 263 264 cur_dir = os.path.dirname(os.path.abspath(__file__)) 265 fname = os.path.join(cur_dir, "results", "epil.csv") 266 data = pd.read_csv(fname) 267 268 fam = families.Poisson() 269 ind = cov_struct.Independence() 270 mod1 = gee.GEE.from_formula("y ~ age + trt + base", data["subject"], 271 data, cov_struct=ind, family=fam) 272 rslt1 = mod1.fit(cov_type='naive') 273 274 # Coefficients should agree with GLM 275 from statsmodels.genmod.generalized_linear_model import GLM 276 277 mod2 = GLM.from_formula("y ~ age + trt + base", data, 278 family=families.Poisson()) 279 rslt2 = mod2.fit() 280 281 # do not use wrapper, asserts_xxx do not work 282 rslt1 = rslt1._results 283 rslt2 = rslt2._results 284 285 assert_allclose(rslt1.params, rslt2.params, rtol=1e-6, atol=1e-6) 286 assert_allclose(rslt1.bse, rslt2.bse, rtol=1e-6, atol=1e-6) 287 288 def test_missing(self): 289 # Test missing data handling for calling from the api. Missing 290 # data handling does not currently work for formulas. 291 292 np.random.seed(34234) 293 endog = np.random.normal(size=100) 294 exog = np.random.normal(size=(100, 3)) 295 exog[:, 0] = 1 296 groups = np.kron(lrange(20), np.ones(5)) 297 298 endog[0] = np.nan 299 endog[5:7] = np.nan 300 exog[10:12, 1] = np.nan 301 302 mod1 = gee.GEE(endog, exog, groups, missing='drop') 303 rslt1 = mod1.fit() 304 305 assert_almost_equal(len(mod1.endog), 95) 306 assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 3]) 307 308 ii = np.isfinite(endog) & np.isfinite(exog).all(1) 309 310 mod2 = gee.GEE(endog[ii], exog[ii, :], groups[ii], missing='none') 311 rslt2 = mod2.fit() 312 313 assert_almost_equal(rslt1.params, rslt2.params) 314 assert_almost_equal(rslt1.bse, rslt2.bse) 315 316 def test_missing_formula(self): 317 # Test missing data handling for formulas. 318 319 np.random.seed(34234) 320 endog = np.random.normal(size=100) 321 exog1 = np.random.normal(size=100) 322 exog2 = np.random.normal(size=100) 323 exog3 = np.random.normal(size=100) 324 groups = np.kron(lrange(20), np.ones(5)) 325 326 endog[0] = np.nan 327 endog[5:7] = np.nan 328 exog2[10:12] = np.nan 329 330 data0 = pd.DataFrame({"endog": endog, "exog1": exog1, "exog2": exog2, 331 "exog3": exog3, "groups": groups}) 332 333 for k in 0, 1: 334 data = data0.copy() 335 kwargs = {} 336 if k == 1: 337 data["offset"] = 0 338 data["time"] = 0 339 kwargs["offset"] = "offset" 340 kwargs["time"] = "time" 341 342 mod1 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3", 343 groups="groups", data=data, 344 missing='drop', **kwargs) 345 rslt1 = mod1.fit() 346 347 assert_almost_equal(len(mod1.endog), 95) 348 assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 4]) 349 350 data = data.dropna() 351 352 kwargs = {} 353 if k == 1: 354 kwargs["offset"] = data["offset"] 355 kwargs["time"] = data["time"] 356 357 mod2 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3", 358 groups=data["groups"], data=data, 359 missing='none', **kwargs) 360 rslt2 = mod2.fit() 361 362 assert_almost_equal(rslt1.params.values, rslt2.params.values) 363 assert_almost_equal(rslt1.bse.values, rslt2.bse.values) 364 365 @pytest.mark.parametrize("k1", [False, True]) 366 @pytest.mark.parametrize("k2", [False, True]) 367 def test_invalid_args(self, k1, k2): 368 369 for j in range(3): 370 371 p = [20, 20, 20] 372 p[j] = 18 373 374 endog = np.zeros(p[0]) 375 exog = np.zeros((p[1], 2)) 376 377 kwargs = {} 378 kwargs["groups"] = np.zeros(p[2]) 379 if k1: 380 kwargs["exposure"] = np.zeros(18) 381 if k2: 382 kwargs["time"] = np.zeros(18) 383 with assert_raises(ValueError): 384 gee.GEE(endog, exog, **kwargs) 385 386 def test_default_time(self): 387 # Check that the time defaults work correctly. 388 389 endog, exog, group = load_data("gee_logistic_1.csv") 390 391 # Time values for the autoregressive model 392 T = np.zeros(len(endog)) 393 idx = set(group) 394 for ii in idx: 395 jj = np.flatnonzero(group == ii) 396 T[jj] = lrange(len(jj)) 397 398 family = families.Binomial() 399 va = cov_struct.Autoregressive(grid=False) 400 401 md1 = gee.GEE(endog, exog, group, family=family, cov_struct=va) 402 mdf1 = md1.fit() 403 404 md2 = gee.GEE(endog, exog, group, time=T, family=family, 405 cov_struct=va) 406 mdf2 = md2.fit() 407 408 assert_almost_equal(mdf1.params, mdf2.params, decimal=6) 409 assert_almost_equal(mdf1.standard_errors(), 410 mdf2.standard_errors(), decimal=6) 411 412 def test_logistic(self): 413 # R code for comparing results: 414 415 # library(gee) 416 # Z = read.csv("results/gee_logistic_1.csv", header=FALSE) 417 # Y = Z[,2] 418 # Id = Z[,1] 419 # X1 = Z[,3] 420 # X2 = Z[,4] 421 # X3 = Z[,5] 422 423 # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial, 424 # corstr="independence") 425 # smi = summary(mi) 426 # u = coefficients(smi) 427 # cfi = paste(u[,1], collapse=",") 428 # sei = paste(u[,4], collapse=",") 429 430 # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial, 431 # corstr="exchangeable") 432 # sme = summary(me) 433 # u = coefficients(sme) 434 # cfe = paste(u[,1], collapse=",") 435 # see = paste(u[,4], collapse=",") 436 437 # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial, 438 # corstr="AR-M") 439 # sma = summary(ma) 440 # u = coefficients(sma) 441 # cfa = paste(u[,1], collapse=",") 442 # sea = paste(u[,4], collapse=",") 443 444 # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa) 445 # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea) 446 447 endog, exog, group = load_data("gee_logistic_1.csv") 448 449 # Time values for the autoregressive model 450 T = np.zeros(len(endog)) 451 idx = set(group) 452 for ii in idx: 453 jj = np.flatnonzero(group == ii) 454 T[jj] = lrange(len(jj)) 455 456 family = families.Binomial() 457 ve = cov_struct.Exchangeable() 458 vi = cov_struct.Independence() 459 va = cov_struct.Autoregressive(grid=False) 460 461 # From R gee 462 cf = [[0.0167272965285882, 1.13038654425893, 463 -1.86896345082962, 1.09397608331333], 464 [0.0178982283915449, 1.13118798191788, 465 -1.86133518416017, 1.08944256230299], 466 [0.0109621937947958, 1.13226505028438, 467 -1.88278757333046, 1.09954623769449]] 468 se = [[0.127291720283049, 0.166725808326067, 469 0.192430061340865, 0.173141068839597], 470 [0.127045031730155, 0.165470678232842, 471 0.192052750030501, 0.173174779369249], 472 [0.127240302296444, 0.170554083928117, 473 0.191045527104503, 0.169776150974586]] 474 475 for j, v in enumerate((vi, ve, va)): 476 md = gee.GEE(endog, exog, group, T, family, v) 477 mdf = md.fit() 478 if id(v) != id(va): 479 assert_almost_equal(mdf.params, cf[j], decimal=6) 480 assert_almost_equal(mdf.standard_errors(), se[j], 481 decimal=6) 482 483 # Test with formulas 484 D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]), 485 axis=1) 486 D = pd.DataFrame(D) 487 D.columns = ["Y", "Id", ] + ["X%d" % (k + 1) 488 for k in range(exog.shape[1] - 1)] 489 for j, v in enumerate((vi, ve)): 490 md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D, 491 family=family, cov_struct=v) 492 mdf = md.fit() 493 assert_almost_equal(mdf.params, cf[j], decimal=6) 494 assert_almost_equal(mdf.standard_errors(), se[j], 495 decimal=6) 496 497 # FIXME: do not leave commented-out 498 # Check for run-time exceptions in summary 499 # print(mdf.summary()) 500 501 def test_autoregressive(self): 502 503 dep_params_true = [0, 0.589208623896, 0.559823804948] 504 505 params_true = [[1.08043787, 1.12709319, 0.90133927], 506 [0.9613677, 1.05826987, 0.90832055], 507 [1.05370439, 0.96084864, 0.93923374]] 508 509 np.random.seed(342837482) 510 511 num_group = 100 512 ar_param = 0.5 513 k = 3 514 515 ga = families.Gaussian() 516 517 for gsize in 1, 2, 3: 518 519 ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :] 520 ix = np.abs(ix) 521 cmat = ar_param ** ix 522 cmat_r = np.linalg.cholesky(cmat) 523 524 endog = [] 525 exog = [] 526 groups = [] 527 for i in range(num_group): 528 x = np.random.normal(size=(gsize, k)) 529 exog.append(x) 530 expval = x.sum(1) 531 errors = np.dot(cmat_r, np.random.normal(size=gsize)) 532 endog.append(expval + errors) 533 groups.append(i * np.ones(gsize)) 534 535 endog = np.concatenate(endog) 536 groups = np.concatenate(groups) 537 exog = np.concatenate(exog, axis=0) 538 539 ar = cov_struct.Autoregressive(grid=False) 540 md = gee.GEE(endog, exog, groups, family=ga, cov_struct=ar) 541 mdf = md.fit() 542 assert_almost_equal(ar.dep_params, dep_params_true[gsize - 1]) 543 assert_almost_equal(mdf.params, params_true[gsize - 1]) 544 545 def test_post_estimation(self): 546 547 family = families.Gaussian() 548 endog, exog, group = load_data("gee_linear_1.csv") 549 550 ve = cov_struct.Exchangeable() 551 552 md = gee.GEE(endog, exog, group, None, family, ve) 553 mdf = md.fit() 554 555 assert_almost_equal(np.dot(exog, mdf.params), 556 mdf.fittedvalues) 557 assert_almost_equal(endog - np.dot(exog, mdf.params), 558 mdf.resid) 559 560 def test_scoretest(self): 561 # Regression tests 562 563 np.random.seed(6432) 564 n = 200 # Must be divisible by 4 565 exog = np.random.normal(size=(n, 4)) 566 endog = exog[:, 0] + exog[:, 1] + exog[:, 2] 567 endog += 3 * np.random.normal(size=n) 568 group = np.kron(np.arange(n / 4), np.ones(4)) 569 570 # Test under the null. 571 L = np.array([[1., -1, 0, 0]]) 572 R = np.array([0., ]) 573 family = families.Gaussian() 574 va = cov_struct.Independence() 575 mod1 = gee.GEE(endog, exog, group, family=family, 576 cov_struct=va, constraint=(L, R)) 577 res1 = mod1.fit() 578 assert_almost_equal(res1.score_test()["statistic"], 579 1.08126334) 580 assert_almost_equal(res1.score_test()["p-value"], 581 0.2984151086) 582 583 # Test under the alternative. 584 L = np.array([[1., -1, 0, 0]]) 585 R = np.array([1.0, ]) 586 family = families.Gaussian() 587 va = cov_struct.Independence() 588 mod2 = gee.GEE(endog, exog, group, family=family, 589 cov_struct=va, constraint=(L, R)) 590 res2 = mod2.fit() 591 assert_almost_equal(res2.score_test()["statistic"], 592 3.491110965) 593 assert_almost_equal(res2.score_test()["p-value"], 594 0.0616991659) 595 596 # Compare to Wald tests 597 exog = np.random.normal(size=(n, 2)) 598 L = np.array([[1, -1]]) 599 R = np.array([0.]) 600 f = np.r_[1, -1] 601 for i in range(10): 602 endog = exog[:, 0] + (0.5 + i / 10.) * exog[:, 1] +\ 603 np.random.normal(size=n) 604 family = families.Gaussian() 605 va = cov_struct.Independence() 606 mod0 = gee.GEE(endog, exog, group, family=family, 607 cov_struct=va) 608 rslt0 = mod0.fit() 609 family = families.Gaussian() 610 va = cov_struct.Independence() 611 mod1 = gee.GEE(endog, exog, group, family=family, 612 cov_struct=va, constraint=(L, R)) 613 res1 = mod1.fit() 614 se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f))) 615 wald_z = np.dot(f, rslt0.params) / se 616 wald_p = 2 * norm.cdf(-np.abs(wald_z)) 617 score_p = res1.score_test()["p-value"] 618 assert_array_less(np.abs(wald_p - score_p), 0.02) 619 620 @pytest.mark.parametrize("cov_struct", [cov_struct.Independence, 621 cov_struct.Exchangeable]) 622 def test_compare_score_test(self, cov_struct): 623 624 np.random.seed(6432) 625 n = 200 # Must be divisible by 4 626 exog = np.random.normal(size=(n, 4)) 627 group = np.kron(np.arange(n / 4), np.ones(4)) 628 629 exog_sub = exog[:, [0, 3]] 630 endog = exog_sub.sum(1) + 3 * np.random.normal(size=n) 631 632 L = np.asarray([[0, 1, 0, 0], [0, 0, 1, 0]]) 633 R = np.zeros(2) 634 mod_lr = gee.GEE(endog, exog, group, constraint=(L, R), 635 cov_struct=cov_struct()) 636 mod_lr.fit() 637 638 mod_sub = gee.GEE(endog, exog_sub, group, cov_struct=cov_struct()) 639 res_sub = mod_sub.fit() 640 641 for call_fit in [False, True]: 642 mod = gee.GEE(endog, exog, group, cov_struct=cov_struct()) 643 if call_fit: 644 # Should work with or without fitting the parent model 645 mod.fit() 646 score_results = mod.compare_score_test(res_sub) 647 assert_almost_equal( 648 score_results["statistic"], 649 mod_lr.score_test_results["statistic"]) 650 assert_almost_equal( 651 score_results["p-value"], 652 mod_lr.score_test_results["p-value"]) 653 assert_almost_equal( 654 score_results["df"], 655 mod_lr.score_test_results["df"]) 656 657 def test_compare_score_test_warnings(self): 658 659 np.random.seed(6432) 660 n = 200 # Must be divisible by 4 661 exog = np.random.normal(size=(n, 4)) 662 group = np.kron(np.arange(n / 4), np.ones(4)) 663 exog_sub = exog[:, [0, 3]] 664 endog = exog_sub.sum(1) + 3 * np.random.normal(size=n) 665 666 # Mismatched cov_struct 667 with assert_warns(UserWarning): 668 mod_sub = gee.GEE(endog, exog_sub, group, 669 cov_struct=cov_struct.Exchangeable()) 670 res_sub = mod_sub.fit() 671 mod = gee.GEE(endog, exog, group, 672 cov_struct=cov_struct.Independence()) 673 mod.compare_score_test(res_sub) # smoketest 674 675 # Mismatched family 676 with assert_warns(UserWarning): 677 mod_sub = gee.GEE(endog, exog_sub, group, 678 family=families.Gaussian()) 679 res_sub = mod_sub.fit() 680 mod = gee.GEE(endog, exog, group, family=families.Poisson()) 681 mod.compare_score_test(res_sub) # smoketest 682 683 # Mismatched size 684 with assert_raises(Exception): 685 mod_sub = gee.GEE(endog, exog_sub, group) 686 res_sub = mod_sub.fit() 687 mod = gee.GEE(endog[0:100], exog[:100, :], group[0:100]) 688 mod.compare_score_test(res_sub) # smoketest 689 690 # Mismatched weights 691 with assert_warns(UserWarning): 692 w = np.random.uniform(size=n) 693 mod_sub = gee.GEE(endog, exog_sub, group, weights=w) 694 res_sub = mod_sub.fit() 695 mod = gee.GEE(endog, exog, group) 696 mod.compare_score_test(res_sub) # smoketest 697 698 # Parent and submodel are the same dimension 699 with pytest.warns(UserWarning): 700 w = np.random.uniform(size=n) 701 mod_sub = gee.GEE(endog, exog, group) 702 res_sub = mod_sub.fit() 703 mod = gee.GEE(endog, exog, group) 704 mod.compare_score_test(res_sub) # smoketest 705 706 def test_constraint_covtype(self): 707 # Test constraints with different cov types 708 np.random.seed(6432) 709 n = 200 710 exog = np.random.normal(size=(n, 4)) 711 endog = exog[:, 0] + exog[:, 1] + exog[:, 2] 712 endog += 3 * np.random.normal(size=n) 713 group = np.kron(np.arange(n / 4), np.ones(4)) 714 L = np.array([[1., -1, 0, 0]]) 715 R = np.array([0., ]) 716 family = families.Gaussian() 717 va = cov_struct.Independence() 718 for cov_type in "robust", "naive", "bias_reduced": 719 model = gee.GEE(endog, exog, group, family=family, 720 cov_struct=va, constraint=(L, R)) 721 result = model.fit(cov_type=cov_type) 722 result.standard_errors(cov_type=cov_type) 723 assert_allclose(result.cov_robust.shape, np.r_[4, 4]) 724 assert_allclose(result.cov_naive.shape, np.r_[4, 4]) 725 if cov_type == "bias_reduced": 726 assert_allclose(result.cov_robust_bc.shape, np.r_[4, 4]) 727 728 def test_linear(self): 729 # library(gee) 730 731 # Z = read.csv("results/gee_linear_1.csv", header=FALSE) 732 # Y = Z[,2] 733 # Id = Z[,1] 734 # X1 = Z[,3] 735 # X2 = Z[,4] 736 # X3 = Z[,5] 737 # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian, 738 # corstr="independence", tol=1e-8, maxit=100) 739 # smi = summary(mi) 740 # u = coefficients(smi) 741 742 # cfi = paste(u[,1], collapse=",") 743 # sei = paste(u[,4], collapse=",") 744 745 # me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian, 746 # corstr="exchangeable", tol=1e-8, maxit=100) 747 # sme = summary(me) 748 # u = coefficients(sme) 749 750 # cfe = paste(u[,1], collapse=",") 751 # see = paste(u[,4], collapse=",") 752 753 # sprintf("cf = [[%s],[%s]]", cfi, cfe) 754 # sprintf("se = [[%s],[%s]]", sei, see) 755 756 family = families.Gaussian() 757 758 endog, exog, group = load_data("gee_linear_1.csv") 759 760 vi = cov_struct.Independence() 761 ve = cov_struct.Exchangeable() 762 763 # From R gee 764 cf = [[-0.01850226507491, 0.81436304278962, 765 -1.56167635393184, 0.794239361055003], 766 [-0.0182920577154767, 0.814898414022467, 767 -1.56194040106201, 0.793499517527478]] 768 se = [[0.0440733554189401, 0.0479993639119261, 769 0.0496045952071308, 0.0479467597161284], 770 [0.0440369906460754, 0.0480069787567662, 771 0.049519758758187, 0.0479760443027526]] 772 773 for j, v in enumerate((vi, ve)): 774 md = gee.GEE(endog, exog, group, None, family, v) 775 mdf = md.fit() 776 assert_almost_equal(mdf.params, cf[j], decimal=10) 777 assert_almost_equal(mdf.standard_errors(), se[j], 778 decimal=10) 779 780 # Test with formulas 781 D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]), 782 axis=1) 783 D = pd.DataFrame(D) 784 D.columns = ["Y", "Id", ] + ["X%d" % (k + 1) 785 for k in range(exog.shape[1] - 1)] 786 for j, v in enumerate((vi, ve)): 787 md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D, 788 family=family, cov_struct=v) 789 mdf = md.fit() 790 assert_almost_equal(mdf.params, cf[j], decimal=10) 791 assert_almost_equal(mdf.standard_errors(), se[j], 792 decimal=10) 793 794 def test_linear_constrained(self): 795 796 family = families.Gaussian() 797 798 np.random.seed(34234) 799 exog = np.random.normal(size=(300, 4)) 800 exog[:, 0] = 1 801 endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\ 802 np.random.normal(size=300) 803 group = np.kron(np.arange(100), np.r_[1, 1, 1]) 804 805 vi = cov_struct.Independence() 806 ve = cov_struct.Exchangeable() 807 808 L = np.r_[[[0, 0, 0, 1]]] 809 R = np.r_[0, ] 810 811 for j, v in enumerate((vi, ve)): 812 md = gee.GEE(endog, exog, group, None, family, v, 813 constraint=(L, R)) 814 mdf = md.fit() 815 assert_almost_equal(mdf.params[3], 0, decimal=10) 816 817 def test_nested_linear(self): 818 819 family = families.Gaussian() 820 821 endog, exog, group = load_data("gee_nested_linear_1.csv") 822 823 group_n = [] 824 for i in range(endog.shape[0] // 10): 825 group_n.extend([0, ] * 5) 826 group_n.extend([1, ] * 5) 827 group_n = np.array(group_n)[:, None] 828 829 dp = cov_struct.Independence() 830 md = gee.GEE(endog, exog, group, None, family, dp) 831 mdf1 = md.fit() 832 833 # From statsmodels.GEE (not an independent test) 834 cf = np.r_[-0.1671073, 1.00467426, -2.01723004, 0.97297106] 835 se = np.r_[0.08629606, 0.04058653, 0.04067038, 0.03777989] 836 assert_almost_equal(mdf1.params, cf, decimal=6) 837 assert_almost_equal(mdf1.standard_errors(), se, 838 decimal=6) 839 840 ne = cov_struct.Nested() 841 md = gee.GEE(endog, exog, group, None, family, ne, 842 dep_data=group_n) 843 mdf2 = md.fit(start_params=mdf1.params) 844 845 # From statsmodels.GEE (not an independent test) 846 cf = np.r_[-0.16655319, 1.02183688, -2.00858719, 1.00101969] 847 se = np.r_[0.08632616, 0.02913582, 0.03114428, 0.02893991] 848 assert_almost_equal(mdf2.params, cf, decimal=6) 849 assert_almost_equal(mdf2.standard_errors(), se, 850 decimal=6) 851 852 smry = mdf2.cov_struct.summary() 853 assert_allclose( 854 smry.Variance, 855 np.r_[1.043878, 0.611656, 1.421205], 856 atol=1e-5, rtol=1e-5) 857 858 def test_nested_pandas(self): 859 860 np.random.seed(4234) 861 n = 10000 862 863 # Outer groups 864 groups = np.kron(np.arange(n // 100), np.ones(100)).astype(int) 865 866 # Inner groups 867 groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(int) 868 groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(int) 869 870 # Group effects 871 groups_e = np.random.normal(size=n // 100) 872 groups1_e = 2 * np.random.normal(size=n // 50) 873 groups2_e = 3 * np.random.normal(size=n // 10) 874 875 y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2] 876 y += 0.5 * np.random.normal(size=n) 877 878 df = pd.DataFrame({"y": y, "TheGroups": groups, 879 "groups1": groups1, "groups2": groups2}) 880 881 model = gee.GEE.from_formula("y ~ 1", groups="TheGroups", 882 dep_data="0 + groups1 + groups2", 883 cov_struct=cov_struct.Nested(), 884 data=df) 885 result = model.fit() 886 887 # The true variances are 1, 4, 9, 0.25 888 smry = result.cov_struct.summary() 889 assert_allclose( 890 smry.Variance, 891 np.r_[1.437299, 4.421543, 8.905295, 0.258480], 892 atol=1e-5, rtol=1e-5) 893 894 def test_ordinal(self): 895 896 family = families.Binomial() 897 898 endog, exog, groups = load_data("gee_ordinal_1.csv", 899 icept=False) 900 901 va = cov_struct.GlobalOddsRatio("ordinal") 902 903 mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) 904 rslt = mod.fit() 905 906 # Regression test 907 cf = np.r_[1.09250002, 0.0217443, -0.39851092, -0.01812116, 908 0.03023969, 1.18258516, 0.01803453, -1.10203381] 909 assert_almost_equal(rslt.params, cf, decimal=5) 910 911 # Regression test 912 se = np.r_[0.10883461, 0.10330197, 0.11177088, 0.05486569, 913 0.05997153, 0.09168148, 0.05953324, 0.0853862] 914 assert_almost_equal(rslt.bse, se, decimal=5) 915 916 # Check that we get the correct results type 917 assert_equal(type(rslt), gee.OrdinalGEEResultsWrapper) 918 assert_equal(type(rslt._results), gee.OrdinalGEEResults) 919 920 @pytest.mark.smoke 921 def test_ordinal_formula(self): 922 923 np.random.seed(434) 924 n = 40 925 y = np.random.randint(0, 3, n) 926 groups = np.arange(n) 927 x1 = np.random.normal(size=n) 928 x2 = np.random.normal(size=n) 929 930 df = pd.DataFrame({"y": y, "groups": groups, "x1": x1, "x2": x2}) 931 932 model = gee.OrdinalGEE.from_formula("y ~ 0 + x1 + x2", groups, data=df) 933 model.fit() 934 935 with warnings.catch_warnings(): 936 warnings.simplefilter("ignore") 937 model = gee.NominalGEE.from_formula("y ~ 0 + x1 + x2", groups, 938 data=df) 939 model.fit() 940 941 @pytest.mark.smoke 942 def test_ordinal_independence(self): 943 944 np.random.seed(434) 945 n = 40 946 y = np.random.randint(0, 3, n) 947 groups = np.kron(np.arange(n / 2), np.r_[1, 1]) 948 x = np.random.normal(size=(n, 1)) 949 950 odi = cov_struct.OrdinalIndependence() 951 model1 = gee.OrdinalGEE(y, x, groups, cov_struct=odi) 952 model1.fit() 953 954 @pytest.mark.smoke 955 def test_nominal_independence(self): 956 957 np.random.seed(434) 958 n = 40 959 y = np.random.randint(0, 3, n) 960 groups = np.kron(np.arange(n / 2), np.r_[1, 1]) 961 x = np.random.normal(size=(n, 1)) 962 963 with warnings.catch_warnings(): 964 warnings.simplefilter("ignore") 965 nmi = cov_struct.NominalIndependence() 966 model1 = gee.NominalGEE(y, x, groups, cov_struct=nmi) 967 model1.fit() 968 969 @pytest.mark.smoke 970 @pytest.mark.matplotlib 971 def test_ordinal_plot(self, close_figures): 972 family = families.Binomial() 973 974 endog, exog, groups = load_data("gee_ordinal_1.csv", 975 icept=False) 976 977 va = cov_struct.GlobalOddsRatio("ordinal") 978 979 mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) 980 rslt = mod.fit() 981 982 fig = rslt.plot_distribution() 983 assert_equal(isinstance(fig, plt.Figure), True) 984 985 def test_nominal(self): 986 987 endog, exog, groups = load_data("gee_nominal_1.csv", 988 icept=False) 989 990 # Test with independence correlation 991 va = cov_struct.Independence() 992 mod1 = gee.NominalGEE(endog, exog, groups, cov_struct=va) 993 rslt1 = mod1.fit() 994 995 # Regression test 996 cf1 = np.r_[0.450009, 0.451959, -0.918825, -0.468266] 997 se1 = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258] 998 assert_allclose(rslt1.params, cf1, rtol=1e-5, atol=1e-5) 999 assert_allclose(rslt1.standard_errors(), se1, rtol=1e-5, atol=1e-5) 1000 1001 # Test with global odds ratio dependence 1002 va = cov_struct.GlobalOddsRatio("nominal") 1003 mod2 = gee.NominalGEE(endog, exog, groups, cov_struct=va) 1004 rslt2 = mod2.fit(start_params=rslt1.params) 1005 1006 # Regression test 1007 cf2 = np.r_[0.455365, 0.415334, -0.916589, -0.502116] 1008 se2 = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064] 1009 assert_allclose(rslt2.params, cf2, rtol=1e-5, atol=1e-5) 1010 assert_allclose(rslt2.standard_errors(), se2, rtol=1e-5, atol=1e-5) 1011 1012 # Make sure we get the correct results type 1013 assert_equal(type(rslt1), gee.NominalGEEResultsWrapper) 1014 assert_equal(type(rslt1._results), gee.NominalGEEResults) 1015 1016 def test_poisson(self): 1017 # library(gee) 1018 # Z = read.csv("results/gee_poisson_1.csv", header=FALSE) 1019 # Y = Z[,2] 1020 # Id = Z[,1] 1021 # X1 = Z[,3] 1022 # X2 = Z[,4] 1023 # X3 = Z[,5] 1024 # X4 = Z[,6] 1025 # X5 = Z[,7] 1026 1027 # mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson, 1028 # corstr="independence", scale.fix=TRUE) 1029 # smi = summary(mi) 1030 # u = coefficients(smi) 1031 # cfi = paste(u[,1], collapse=",") 1032 # sei = paste(u[,4], collapse=",") 1033 1034 # me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson, 1035 # corstr="exchangeable", scale.fix=TRUE) 1036 # sme = summary(me) 1037 1038 # u = coefficients(sme) 1039 # cfe = paste(u[,1], collapse=",") 1040 # see = paste(u[,4], collapse=",") 1041 1042 # sprintf("cf = [[%s],[%s]]", cfi, cfe) 1043 # sprintf("se = [[%s],[%s]]", sei, see) 1044 1045 family = families.Poisson() 1046 1047 endog, exog, group_n = load_data("gee_poisson_1.csv") 1048 1049 vi = cov_struct.Independence() 1050 ve = cov_struct.Exchangeable() 1051 1052 # From R gee 1053 cf = [[-0.0364450410793481, -0.0543209391301178, 1054 0.0156642711741052, 0.57628591338724, 1055 -0.00465659951186211, -0.477093153099256], 1056 [-0.0315615554826533, -0.0562589480840004, 1057 0.0178419412298561, 0.571512795340481, 1058 -0.00363255566297332, -0.475971696727736]] 1059 se = [[0.0611309237214186, 0.0390680524493108, 1060 0.0334234174505518, 0.0366860768962715, 1061 0.0304758505008105, 0.0316348058881079], 1062 [0.0610840153582275, 0.0376887268649102, 1063 0.0325168379415177, 0.0369786751362213, 1064 0.0296141014225009, 0.0306115470200955]] 1065 1066 for j, v in enumerate((vi, ve)): 1067 md = gee.GEE(endog, exog, group_n, None, family, v) 1068 mdf = md.fit() 1069 assert_almost_equal(mdf.params, cf[j], decimal=5) 1070 assert_almost_equal(mdf.standard_errors(), se[j], 1071 decimal=6) 1072 1073 # Test with formulas 1074 D = np.concatenate((endog[:, None], group_n[:, None], 1075 exog[:, 1:]), axis=1) 1076 D = pd.DataFrame(D) 1077 D.columns = ["Y", "Id", ] + ["X%d" % (k + 1) 1078 for k in range(exog.shape[1] - 1)] 1079 for j, v in enumerate((vi, ve)): 1080 md = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", 1081 D, family=family, cov_struct=v) 1082 mdf = md.fit() 1083 assert_almost_equal(mdf.params, cf[j], decimal=5) 1084 assert_almost_equal(mdf.standard_errors(), se[j], 1085 decimal=6) 1086 # print(mdf.params) 1087 1088 def test_groups(self): 1089 # Test various group structures (nonconsecutive, different 1090 # group sizes, not ordered, string labels) 1091 1092 np.random.seed(234) 1093 n = 40 1094 x = np.random.normal(size=(n, 2)) 1095 y = np.random.normal(size=n) 1096 1097 # groups with unequal group sizes 1098 groups = np.kron(np.arange(n / 4), np.ones(4)) 1099 groups[8:12] = 3 1100 groups[34:36] = 9 1101 1102 model1 = gee.GEE(y, x, groups=groups) 1103 result1 = model1.fit() 1104 1105 # Unordered groups 1106 ix = np.random.permutation(n) 1107 y1 = y[ix] 1108 x1 = x[ix, :] 1109 groups1 = groups[ix] 1110 1111 model2 = gee.GEE(y1, x1, groups=groups1) 1112 result2 = model2.fit() 1113 1114 assert_allclose(result1.params, result2.params) 1115 assert_allclose(result1.tvalues, result2.tvalues) 1116 1117 # group labels are strings 1118 mp = {} 1119 import string 1120 for j, g in enumerate(set(groups)): 1121 mp[g] = string.ascii_letters[j:j + 4] 1122 groups2 = [mp[g] for g in groups] 1123 1124 model3 = gee.GEE(y, x, groups=groups2) 1125 result3 = model3.fit() 1126 1127 assert_allclose(result1.params, result3.params) 1128 assert_allclose(result1.tvalues, result3.tvalues) 1129 1130 def test_compare_OLS(self): 1131 # Gaussian GEE with independence correlation should agree 1132 # exactly with OLS for parameter estimates and standard errors 1133 # derived from the naive covariance estimate. 1134 1135 vs = cov_struct.Independence() 1136 family = families.Gaussian() 1137 1138 np.random.seed(34234) 1139 Y = np.random.normal(size=100) 1140 X1 = np.random.normal(size=100) 1141 X2 = np.random.normal(size=100) 1142 X3 = np.random.normal(size=100) 1143 groups = np.kron(lrange(20), np.ones(5)) 1144 1145 D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3}) 1146 1147 md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D, 1148 family=family, cov_struct=vs) 1149 mdf = md.fit() 1150 1151 ols = lm.OLS.from_formula("Y ~ X1 + X2 + X3", data=D).fit() 1152 1153 # do not use wrapper, asserts_xxx do not work 1154 ols = ols._results 1155 1156 assert_almost_equal(ols.params, mdf.params, decimal=10) 1157 1158 se = mdf.standard_errors(cov_type="naive") 1159 assert_almost_equal(ols.bse, se, decimal=10) 1160 1161 naive_tvalues = mdf.params / np.sqrt(np.diag(mdf.cov_naive)) 1162 assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10) 1163 1164 def test_formulas(self): 1165 # Check formulas, especially passing groups and time as either 1166 # variable names or arrays. 1167 1168 n = 100 1169 np.random.seed(34234) 1170 Y = np.random.normal(size=n) 1171 X1 = np.random.normal(size=n) 1172 mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1) 1173 Time = np.random.uniform(size=n) 1174 groups = np.kron(lrange(20), np.ones(5)) 1175 1176 data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups}) 1177 1178 va = cov_struct.Autoregressive(grid=False) 1179 family = families.Gaussian() 1180 1181 mod1 = gee.GEE(Y, mat, groups, time=Time, family=family, 1182 cov_struct=va) 1183 rslt1 = mod1.fit() 1184 1185 mod2 = gee.GEE.from_formula("Y ~ X1", groups, data, time=Time, 1186 family=family, cov_struct=va) 1187 rslt2 = mod2.fit() 1188 1189 mod3 = gee.GEE.from_formula("Y ~ X1", groups, data, time="Time", 1190 family=family, cov_struct=va) 1191 rslt3 = mod3.fit() 1192 1193 mod4 = gee.GEE.from_formula("Y ~ X1", "groups", data, time=Time, 1194 family=family, cov_struct=va) 1195 rslt4 = mod4.fit() 1196 1197 mod5 = gee.GEE.from_formula("Y ~ X1", "groups", data, time="Time", 1198 family=family, cov_struct=va) 1199 rslt5 = mod5.fit() 1200 1201 assert_almost_equal(rslt1.params, rslt2.params, decimal=8) 1202 assert_almost_equal(rslt1.params, rslt3.params, decimal=8) 1203 assert_almost_equal(rslt1.params, rslt4.params, decimal=8) 1204 assert_almost_equal(rslt1.params, rslt5.params, decimal=8) 1205 1206 check_wrapper(rslt2) 1207 1208 def test_compare_logit(self): 1209 1210 vs = cov_struct.Independence() 1211 family = families.Binomial() 1212 1213 np.random.seed(34234) 1214 Y = 1 * (np.random.normal(size=100) < 0) 1215 X1 = np.random.normal(size=100) 1216 X2 = np.random.normal(size=100) 1217 X3 = np.random.normal(size=100) 1218 groups = np.random.randint(0, 4, size=100) 1219 1220 D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3}) 1221 1222 mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D, 1223 family=family, cov_struct=vs) 1224 rslt1 = mod1.fit() 1225 1226 mod2 = discrete.Logit.from_formula("Y ~ X1 + X2 + X3", data=D) 1227 rslt2 = mod2.fit(disp=False) 1228 1229 assert_almost_equal(rslt1.params.values, rslt2.params.values, 1230 decimal=10) 1231 1232 def test_compare_poisson(self): 1233 1234 vs = cov_struct.Independence() 1235 family = families.Poisson() 1236 1237 np.random.seed(34234) 1238 Y = np.ceil(-np.log(np.random.uniform(size=100))) 1239 X1 = np.random.normal(size=100) 1240 X2 = np.random.normal(size=100) 1241 X3 = np.random.normal(size=100) 1242 groups = np.random.randint(0, 4, size=100) 1243 1244 D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3}) 1245 1246 mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D, 1247 family=family, cov_struct=vs) 1248 rslt1 = mod1.fit() 1249 1250 mod2 = discrete.Poisson.from_formula("Y ~ X1 + X2 + X3", data=D) 1251 rslt2 = mod2.fit(disp=False) 1252 1253 assert_almost_equal(rslt1.params.values, rslt2.params.values, 1254 decimal=10) 1255 1256 def test_predict(self): 1257 1258 n = 50 1259 np.random.seed(4324) 1260 X1 = np.random.normal(size=n) 1261 X2 = np.random.normal(size=n) 1262 groups = np.kron(np.arange(n / 2), np.r_[1, 1]) 1263 offset = np.random.uniform(1, 2, size=n) 1264 Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n) 1265 data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups, 1266 "offset": offset}) 1267 1268 fml = "Y ~ X1 + X2" 1269 model = gee.GEE.from_formula(fml, groups, data, 1270 family=families.Gaussian(), 1271 offset="offset") 1272 result = model.fit(start_params=[0, 0.1, 0.1]) 1273 assert_equal(result.converged, True) 1274 1275 pred1 = result.predict() 1276 pred2 = result.predict(offset=data.offset) 1277 pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset) 1278 pred4 = result.predict(exog=data[["X1", "X2"]], offset=0 * data.offset) 1279 pred5 = result.predict(offset=0 * data.offset) 1280 1281 assert_allclose(pred1, pred2) 1282 assert_allclose(pred1, pred3) 1283 assert_allclose(pred1, pred4 + data.offset) 1284 assert_allclose(pred1, pred5 + data.offset) 1285 1286 x1_new = np.random.normal(size=10) 1287 x2_new = np.random.normal(size=10) 1288 new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new}) 1289 pred6 = result.predict(exog=new_exog) 1290 params = result.params 1291 pred6_correct = params[0] + params[1] * x1_new + params[2] * x2_new 1292 assert_allclose(pred6, pred6_correct) 1293 1294 def test_stationary_grid(self): 1295 1296 endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.] 1297 exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2] 1298 group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] 1299 exog = tools.add_constant(exog) 1300 1301 cs = cov_struct.Stationary(max_lag=2, grid=True) 1302 model = gee.GEE(endog, exog, group, cov_struct=cs) 1303 result = model.fit() 1304 se = result.bse * np.sqrt(12 / 9.) # Stata adjustment 1305 1306 assert_allclose(cs.covariance_matrix(np.r_[1, 1, 1], 0)[0].sum(), 1307 6.4633538285149452) 1308 1309 # Obtained from Stata using: 1310 # xtgee y x, i(g) vce(robust) corr(Stationary2) 1311 assert_allclose(result.params, np.r_[ 1312 4.463968, -0.0386674], rtol=1e-5, atol=1e-5) 1313 assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5) 1314 1315 def test_stationary_nogrid(self): 1316 1317 # First test special case where the data follow a grid but we 1318 # fit using nogrid 1319 endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.] 1320 exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2] 1321 time = np.r_[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] 1322 group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] 1323 1324 exog = tools.add_constant(exog) 1325 1326 model = gee.GEE(endog, exog, group, 1327 cov_struct=cov_struct.Stationary(max_lag=2, 1328 grid=False)) 1329 result = model.fit() 1330 se = result.bse * np.sqrt(12 / 9.) # Stata adjustment 1331 1332 # Obtained from Stata using: 1333 # xtgee y x, i(g) vce(robust) corr(Stationary2) 1334 assert_allclose(result.params, np.r_[ 1335 4.463968, -0.0386674], rtol=1e-5, atol=1e-5) 1336 assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5) 1337 1338 # Smoke test for no grid # TODO: pytest.mark.smoke> 1339 time = np.r_[0, 1, 3, 0, 2, 3, 0, 2, 3, 0, 1, 2][:, None] 1340 model = gee.GEE(endog, exog, group, time=time, 1341 cov_struct=cov_struct.Stationary(max_lag=4, 1342 grid=False)) 1343 model.fit() 1344 1345 def test_predict_exposure(self): 1346 1347 n = 50 1348 np.random.seed(34234) 1349 X1 = np.random.normal(size=n) 1350 X2 = np.random.normal(size=n) 1351 groups = np.kron(np.arange(25), np.r_[1, 1]) 1352 offset = np.random.uniform(1, 2, size=n) 1353 exposure = np.random.uniform(1, 2, size=n) 1354 Y = np.random.poisson(0.1 * (X1 + X2) + offset + 1355 np.log(exposure), size=n) 1356 data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups, 1357 "offset": offset, "exposure": exposure}) 1358 1359 fml = "Y ~ X1 + X2" 1360 model = gee.GEE.from_formula(fml, groups, data, 1361 family=families.Poisson(), 1362 offset="offset", exposure="exposure") 1363 result = model.fit() 1364 assert_equal(result.converged, True) 1365 1366 pred1 = result.predict() 1367 pred2 = result.predict(offset=data["offset"]) 1368 pred3 = result.predict(exposure=data["exposure"]) 1369 pred4 = result.predict( 1370 offset=data["offset"], exposure=data["exposure"]) 1371 pred5 = result.predict(exog=data[-10:], 1372 offset=data["offset"][-10:], 1373 exposure=data["exposure"][-10:]) 1374 # without patsy 1375 pred6 = result.predict(exog=result.model.exog[-10:], 1376 offset=data["offset"][-10:], 1377 exposure=data["exposure"][-10:], 1378 transform=False) 1379 assert_allclose(pred1, pred2) 1380 assert_allclose(pred1, pred3) 1381 assert_allclose(pred1, pred4) 1382 assert_allclose(pred1[-10:], pred5) 1383 assert_allclose(pred1[-10:], pred6) 1384 1385 def test_offset_formula(self): 1386 # Test various ways of passing offset and exposure to `from_formula`. 1387 1388 n = 50 1389 np.random.seed(34234) 1390 X1 = np.random.normal(size=n) 1391 X2 = np.random.normal(size=n) 1392 groups = np.kron(np.arange(25), np.r_[1, 1]) 1393 offset = np.random.uniform(1, 2, size=n) 1394 exposure = np.exp(offset) 1395 Y = np.random.poisson(0.1 * (X1 + X2) + 2 * offset, size=n) 1396 data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups, 1397 "offset": offset, "exposure": exposure}) 1398 1399 fml = "Y ~ X1 + X2" 1400 model1 = gee.GEE.from_formula(fml, groups, data, 1401 family=families.Poisson(), 1402 offset="offset") 1403 result1 = model1.fit() 1404 assert_equal(result1.converged, True) 1405 1406 model2 = gee.GEE.from_formula(fml, groups, data, 1407 family=families.Poisson(), 1408 offset=offset) 1409 result2 = model2.fit(start_params=result1.params) 1410 assert_allclose(result1.params, result2.params) 1411 assert_equal(result2.converged, True) 1412 1413 model3 = gee.GEE.from_formula(fml, groups, data, 1414 family=families.Poisson(), 1415 exposure=exposure) 1416 result3 = model3.fit(start_params=result1.params) 1417 assert_allclose(result1.params, result3.params) 1418 assert_equal(result3.converged, True) 1419 1420 model4 = gee.GEE.from_formula(fml, groups, data, 1421 family=families.Poisson(), 1422 exposure="exposure") 1423 result4 = model4.fit(start_params=result1.params) 1424 assert_allclose(result1.params, result4.params) 1425 assert_equal(result4.converged, True) 1426 1427 model5 = gee.GEE.from_formula(fml, groups, data, 1428 family=families.Poisson(), 1429 exposure="exposure", offset="offset") 1430 result5 = model5.fit() 1431 assert_equal(result5.converged, True) 1432 1433 model6 = gee.GEE.from_formula(fml, groups, data, 1434 family=families.Poisson(), 1435 offset=2 * offset) 1436 result6 = model6.fit(start_params=result5.params) 1437 assert_allclose(result5.params, result6.params) 1438 assert_equal(result6.converged, True) 1439 1440 def test_sensitivity(self): 1441 1442 va = cov_struct.Exchangeable() 1443 family = families.Gaussian() 1444 1445 np.random.seed(34234) 1446 n = 100 1447 Y = np.random.normal(size=n) 1448 X1 = np.random.normal(size=n) 1449 X2 = np.random.normal(size=n) 1450 groups = np.kron(np.arange(50), np.r_[1, 1]) 1451 1452 D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2}) 1453 1454 mod = gee.GEE.from_formula("Y ~ X1 + X2", groups, D, 1455 family=family, cov_struct=va) 1456 rslt = mod.fit() 1457 ps = rslt.params_sensitivity(0, 0.5, 2) 1458 assert_almost_equal(len(ps), 2) 1459 assert_almost_equal([x.cov_struct.dep_params for x in ps], 1460 [0.0, 0.5]) 1461 1462 # Regression test 1463 assert_almost_equal([x.params[0] for x in ps], 1464 [0.1696214707458818, 0.17836097387799127]) 1465 1466 def test_equivalence(self): 1467 """ 1468 The Equivalence covariance structure can represent an 1469 exchangeable covariance structure. Here we check that the 1470 results are identical using the two approaches. 1471 """ 1472 1473 np.random.seed(3424) 1474 endog = np.random.normal(size=20) 1475 exog = np.random.normal(size=(20, 2)) 1476 exog[:, 0] = 1 1477 groups = np.kron(np.arange(5), np.ones(4)) 1478 groups[12:] = 3 # Create unequal size groups 1479 1480 # Set up an Equivalence covariance structure to mimic an 1481 # Exchangeable covariance structure. 1482 pairs = {} 1483 start = [0, 4, 8, 12] 1484 for k in range(4): 1485 pairs[k] = {} 1486 1487 # Diagonal values (variance parameters) 1488 if k < 3: 1489 pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3], 1490 start[k] + np.r_[0, 1, 2, 3]) 1491 else: 1492 pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7], 1493 start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7]) 1494 1495 # Off-diagonal pairs (covariance parameters) 1496 if k < 3: 1497 a, b = np.tril_indices(4, -1) 1498 pairs[k][1] = (start[k] + a, start[k] + b) 1499 else: 1500 a, b = np.tril_indices(8, -1) 1501 pairs[k][1] = (start[k] + a, start[k] + b) 1502 1503 ex = cov_struct.Exchangeable() 1504 model1 = gee.GEE(endog, exog, groups, cov_struct=ex) 1505 result1 = model1.fit() 1506 1507 for return_cov in False, True: 1508 1509 ec = cov_struct.Equivalence(pairs, return_cov=return_cov) 1510 model2 = gee.GEE(endog, exog, groups, cov_struct=ec) 1511 result2 = model2.fit() 1512 1513 # Use large atol/rtol for the correlation case since there 1514 # are some small differences in the results due to degree 1515 # of freedom differences. 1516 if return_cov is True: 1517 atol, rtol = 1e-6, 1e-6 1518 else: 1519 atol, rtol = 1e-3, 1e-3 1520 assert_allclose(result1.params, result2.params, 1521 atol=atol, rtol=rtol) 1522 assert_allclose(result1.bse, result2.bse, atol=atol, rtol=rtol) 1523 assert_allclose(result1.scale, result2.scale, atol=atol, rtol=rtol) 1524 1525 def test_equivalence_from_pairs(self): 1526 1527 np.random.seed(3424) 1528 endog = np.random.normal(size=50) 1529 exog = np.random.normal(size=(50, 2)) 1530 exog[:, 0] = 1 1531 groups = np.kron(np.arange(5), np.ones(10)) 1532 groups[30:] = 3 # Create unequal size groups 1533 1534 # Set up labels. 1535 labels = np.kron(np.arange(5), np.ones(10)).astype(np.int32) 1536 labels = labels[np.random.permutation(len(labels))] 1537 1538 eq = cov_struct.Equivalence(labels=labels, return_cov=True) 1539 model1 = gee.GEE(endog, exog, groups, cov_struct=eq) 1540 1541 # Call this directly instead of letting init do it to get the 1542 # result before reindexing. 1543 eq._pairs_from_labels() 1544 1545 # Make sure the size is correct to hold every element. 1546 for g in model1.group_labels: 1547 p = eq.pairs[g] 1548 vl = [len(x[0]) for x in p.values()] 1549 m = sum(groups == g) 1550 assert_allclose(sum(vl), m * (m + 1) / 2) 1551 1552 # Check for duplicates. 1553 ixs = set([]) 1554 for g in model1.group_labels: 1555 for v in eq.pairs[g].values(): 1556 for a, b in zip(v[0], v[1]): 1557 ky = (a, b) 1558 assert(ky not in ixs) 1559 ixs.add(ky) 1560 1561 # Smoke test # TODO: pytest.mark.smoke? 1562 eq = cov_struct.Equivalence(labels=labels, return_cov=True) 1563 model1 = gee.GEE(endog, exog, groups, cov_struct=eq) 1564 with warnings.catch_warnings(): 1565 warnings.simplefilter('ignore') 1566 model1.fit(maxiter=2) 1567 1568 1569class CheckConsistency(object): 1570 1571 start_params = None 1572 1573 def test_cov_type(self): 1574 mod = self.mod 1575 res_robust = mod.fit(start_params=self.start_params) 1576 res_naive = mod.fit(start_params=self.start_params, 1577 cov_type='naive') 1578 res_robust_bc = mod.fit(start_params=self.start_params, 1579 cov_type='bias_reduced') 1580 1581 # call summary to make sure it does not change cov_type 1582 res_naive.summary() 1583 res_robust_bc.summary() 1584 1585 # check cov_type 1586 assert_equal(res_robust.cov_type, 'robust') 1587 assert_equal(res_naive.cov_type, 'naive') 1588 assert_equal(res_robust_bc.cov_type, 'bias_reduced') 1589 1590 # check bse and cov_params 1591 # we are comparing different runs of the optimization 1592 # bse in ordinal and multinomial have an atol around 5e-10 for two 1593 # consecutive calls to fit. 1594 rtol = 1e-8 1595 for (res, cov_type, cov) in [ 1596 (res_robust, 'robust', res_robust.cov_robust), 1597 (res_naive, 'naive', res_robust.cov_naive), 1598 (res_robust_bc, 'bias_reduced', res_robust_bc.cov_robust_bc) 1599 ]: 1600 bse = np.sqrt(np.diag(cov)) 1601 assert_allclose(res.bse, bse, rtol=rtol) 1602 if cov_type != 'bias_reduced': 1603 # cov_type=naive shortcuts calculation of bias reduced 1604 # covariance for efficiency 1605 bse = res_naive.standard_errors(cov_type=cov_type) 1606 assert_allclose(res.bse, bse, rtol=rtol) 1607 assert_allclose(res.cov_params(), cov, rtol=rtol, atol=1e-10) 1608 assert_allclose(res.cov_params_default, cov, rtol=rtol, atol=1e-10) 1609 1610 # assert that we do not have a copy 1611 assert_(res_robust.cov_params_default is res_robust.cov_robust) 1612 assert_(res_naive.cov_params_default is res_naive.cov_naive) 1613 assert_(res_robust_bc.cov_params_default is 1614 res_robust_bc.cov_robust_bc) 1615 1616 # check exception for misspelled cov_type 1617 assert_raises(ValueError, mod.fit, cov_type='robust_bc') 1618 1619 1620class TestGEEPoissonCovType(CheckConsistency): 1621 1622 @classmethod 1623 def setup_class(cls): 1624 1625 endog, exog, group_n = load_data("gee_poisson_1.csv") 1626 1627 family = families.Poisson() 1628 vi = cov_struct.Independence() 1629 1630 cls.mod = gee.GEE(endog, exog, group_n, None, family, vi) 1631 1632 cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427, 1633 0.57628591, -0.0046566, -0.47709315]) 1634 1635 def test_wrapper(self): 1636 1637 endog, exog, group_n = load_data("gee_poisson_1.csv", 1638 icept=False) 1639 endog = pd.Series(endog) 1640 exog = pd.DataFrame(exog) 1641 group_n = pd.Series(group_n) 1642 1643 family = families.Poisson() 1644 vi = cov_struct.Independence() 1645 1646 mod = gee.GEE(endog, exog, group_n, None, family, vi) 1647 rslt2 = mod.fit() 1648 1649 check_wrapper(rslt2) 1650 1651 1652class TestGEEPoissonFormulaCovType(CheckConsistency): 1653 1654 @classmethod 1655 def setup_class(cls): 1656 1657 endog, exog, group_n = load_data("gee_poisson_1.csv") 1658 1659 family = families.Poisson() 1660 vi = cov_struct.Independence() 1661 # Test with formulas 1662 D = np.concatenate((endog[:, None], group_n[:, None], 1663 exog[:, 1:]), axis=1) 1664 D = pd.DataFrame(D) 1665 D.columns = ["Y", "Id", ] + ["X%d" % (k + 1) 1666 for k in range(exog.shape[1] - 1)] 1667 1668 cls.mod = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id", 1669 D, family=family, cov_struct=vi) 1670 1671 cls.start_params = np.array([-0.03644504, -0.05432094, 0.01566427, 1672 0.57628591, -0.0046566, -0.47709315]) 1673 1674 1675class TestGEEOrdinalCovType(CheckConsistency): 1676 1677 @classmethod 1678 def setup_class(cls): 1679 1680 family = families.Binomial() 1681 1682 endog, exog, groups = load_data("gee_ordinal_1.csv", 1683 icept=False) 1684 1685 va = cov_struct.GlobalOddsRatio("ordinal") 1686 1687 cls.mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) 1688 cls.start_params = np.array([1.09250002, 0.0217443, -0.39851092, 1689 -0.01812116, 0.03023969, 1.18258516, 1690 0.01803453, -1.10203381]) 1691 1692 def test_wrapper(self): 1693 1694 endog, exog, groups = load_data("gee_ordinal_1.csv", 1695 icept=False) 1696 1697 endog = pd.Series(endog, name='yendog') 1698 exog = pd.DataFrame(exog) 1699 groups = pd.Series(groups, name='the_group') 1700 1701 family = families.Binomial() 1702 va = cov_struct.GlobalOddsRatio("ordinal") 1703 mod = gee.OrdinalGEE(endog, exog, groups, None, family, va) 1704 rslt2 = mod.fit() 1705 1706 check_wrapper(rslt2) 1707 1708 1709class TestGEEMultinomialCovType(CheckConsistency): 1710 1711 @classmethod 1712 def setup_class(cls): 1713 1714 endog, exog, groups = load_data("gee_nominal_1.csv", 1715 icept=False) 1716 1717 # Test with independence correlation 1718 va = cov_struct.Independence() 1719 cls.mod = gee.NominalGEE(endog, exog, groups, cov_struct=va) 1720 cls.start_params = np.array([0.44944752, 0.45569985, -0.92007064, 1721 -0.46766728]) 1722 1723 def test_wrapper(self): 1724 1725 endog, exog, groups = load_data("gee_nominal_1.csv", 1726 icept=False) 1727 endog = pd.Series(endog, name='yendog') 1728 exog = pd.DataFrame(exog) 1729 groups = pd.Series(groups, name='the_group') 1730 1731 va = cov_struct.Independence() 1732 mod = gee.NominalGEE(endog, exog, groups, cov_struct=va) 1733 rslt2 = mod.fit() 1734 1735 check_wrapper(rslt2) 1736 1737 1738def test_regularized_poisson(): 1739 1740 np.random.seed(8735) 1741 1742 ng, gs, p = 1000, 5, 5 1743 1744 x = np.random.normal(size=(ng*gs, p)) 1745 r = 0.5 1746 x[:, 2] = r*x[:, 1] + np.sqrt(1-r**2)*x[:, 2] 1747 lpr = 0.7*(x[:, 1] - x[:, 3]) 1748 mean = np.exp(lpr) 1749 y = np.random.poisson(mean) 1750 1751 groups = np.kron(np.arange(ng), np.ones(gs)) 1752 1753 model = gee.GEE(y, x, groups=groups, family=families.Poisson()) 1754 result = model.fit_regularized(0.0000001) 1755 1756 assert_allclose(result.params, 0.7 * np.r_[0, 1, 0, -1, 0], 1757 rtol=0.01, atol=0.12) 1758 1759 1760def test_regularized_gaussian(): 1761 1762 # Example 1 from Wang et al. 1763 1764 np.random.seed(8735) 1765 1766 ng, gs, p = 200, 4, 200 1767 1768 groups = np.kron(np.arange(ng), np.ones(gs)) 1769 1770 x = np.zeros((ng*gs, p)) 1771 x[:, 0] = 1 * (np.random.uniform(size=ng*gs) < 0.5) 1772 x[:, 1] = np.random.normal(size=ng*gs) 1773 r = 0.5 1774 for j in range(2, p): 1775 eps = np.random.normal(size=ng*gs) 1776 x[:, j] = r * x[:, j-1] + np.sqrt(1 - r**2) * eps 1777 lpr = np.dot(x[:, 0:4], np.r_[2, 3, 1.5, 2]) 1778 s = 0.4 1779 e = np.sqrt(s) * np.kron(np.random.normal(size=ng), np.ones(gs)) 1780 e += np.sqrt(1 - s) * np.random.normal(size=ng*gs) 1781 1782 y = lpr + e 1783 1784 model = gee.GEE(y, x, cov_struct=cov_struct.Exchangeable(), groups=groups) 1785 result = model.fit_regularized(0.01, maxiter=100) 1786 1787 ex = np.zeros(200) 1788 ex[0:4] = np.r_[2, 3, 1.5, 2] 1789 assert_allclose(result.params, ex, rtol=0.01, atol=0.2) 1790 1791 assert_allclose(model.cov_struct.dep_params, np.r_[s], 1792 rtol=0.01, atol=0.05) 1793 1794 1795@pytest.mark.smoke 1796@pytest.mark.matplotlib 1797def test_plots(close_figures): 1798 1799 np.random.seed(378) 1800 exog = np.random.normal(size=100) 1801 endog = np.random.normal(size=(100, 2)) 1802 groups = np.kron(np.arange(50), np.r_[1, 1]) 1803 1804 model = gee.GEE(exog, endog, groups) 1805 result = model.fit() 1806 fig = result.plot_added_variable(1) 1807 assert_equal(isinstance(fig, plt.Figure), True) 1808 fig = result.plot_partial_residuals(1) 1809 assert_equal(isinstance(fig, plt.Figure), True) 1810 fig = result.plot_ceres_residuals(1) 1811 assert_equal(isinstance(fig, plt.Figure), True) 1812 fig = result.plot_isotropic_dependence() 1813 assert_equal(isinstance(fig, plt.Figure), True) 1814 1815 1816def test_missing(): 1817 # gh-1877 1818 data = [['id', 'al', 'status', 'fake', 'grps'], 1819 ['4A', 'A', 1, 1, 0], 1820 ['5A', 'A', 1, 2.0, 1], 1821 ['6A', 'A', 1, 3, 2], 1822 ['7A', 'A', 1, 2.0, 3], 1823 ['8A', 'A', 1, 1, 4], 1824 ['9A', 'A', 1, 2.0, 5], 1825 ['11A', 'A', 1, 1, 6], 1826 ['12A', 'A', 1, 2.0, 7], 1827 ['13A', 'A', 1, 1, 8], 1828 ['14A', 'A', 1, 1, 9], 1829 ['15A', 'A', 1, 1, 10], 1830 ['16A', 'A', 1, 2.0, 11], 1831 ['17A', 'A', 1, 3.0, 12], 1832 ['18A', 'A', 1, 3.0, 13], 1833 ['19A', 'A', 1, 2.0, 14], 1834 ['20A', 'A', 1, 2.0, 15], 1835 ['2C', 'C', 0, 3.0, 0], 1836 ['3C', 'C', 0, 1, 1], 1837 ['4C', 'C', 0, 1, 2], 1838 ['5C', 'C', 0, 2.0, 3], 1839 ['6C', 'C', 0, 1, 4], 1840 ['9C', 'C', 0, 1, 5], 1841 ['10C', 'C', 0, 3, 6], 1842 ['12C', 'C', 0, 3, 7], 1843 ['14C', 'C', 0, 2.5, 8], 1844 ['15C', 'C', 0, 1, 9], 1845 ['17C', 'C', 0, 1, 10], 1846 ['22C', 'C', 0, 1, 11], 1847 ['23C', 'C', 0, 1, 12], 1848 ['24C', 'C', 0, 1, 13], 1849 ['32C', 'C', 0, 2.0, 14], 1850 ['35C', 'C', 0, 1, 15]] 1851 1852 df = pd.DataFrame(data[1:], columns=data[0]) 1853 df.loc[df.fake == 1, 'fake'] = np.nan 1854 mod = gee.GEE.from_formula('status ~ fake', data=df, groups='grps', 1855 cov_struct=cov_struct.Independence(), 1856 family=families.Binomial()) 1857 1858 df = df.dropna().copy() 1859 df['constant'] = 1 1860 1861 mod2 = gee.GEE(df.status, df[['constant', 'fake']], groups=df.grps, 1862 cov_struct=cov_struct.Independence(), 1863 family=families.Binomial()) 1864 1865 assert_equal(mod.endog, mod2.endog) 1866 assert_equal(mod.exog, mod2.exog) 1867 assert_equal(mod.groups, mod2.groups) 1868 1869 res = mod.fit() 1870 res2 = mod2.fit() 1871 1872 assert_almost_equal(res.params.values, res2.params.values) 1873 1874 1875def simple_qic_data(fam): 1876 1877 y = np.r_[0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0] 1878 x1 = np.r_[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0] 1879 x2 = np.r_[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] 1880 g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4] 1881 x1 = x1[:, None] 1882 x2 = x2[:, None] 1883 1884 return y, x1, x2, g 1885 1886 1887# Test quasi-likelihood by numerical integration in two settings 1888# where there is a closed form expression. 1889@pytest.mark.parametrize("family", [families.Gaussian, families.Poisson]) 1890def test_ql_known(family): 1891 1892 fam = family() 1893 1894 y, x1, x2, g = simple_qic_data(family) 1895 1896 model1 = gee.GEE(y, x1, family=fam, groups=g) 1897 result1 = model1.fit(ddof_scale=0) 1898 mean1 = result1.fittedvalues 1899 1900 model2 = gee.GEE(y, x2, family=fam, groups=g) 1901 result2 = model2.fit(ddof_scale=0) 1902 mean2 = result2.fittedvalues 1903 1904 if family is families.Gaussian: 1905 ql1 = -len(y) / 2. 1906 ql2 = -len(y) / 2. 1907 elif family is families.Poisson: 1908 c = np.zeros_like(y) 1909 ii = y > 0 1910 c[ii] = y[ii] * np.log(y[ii]) - y[ii] 1911 ql1 = np.sum(y * np.log(mean1) - mean1 - c) 1912 ql2 = np.sum(y * np.log(mean2) - mean2 - c) 1913 else: 1914 raise ValueError("Unknown family") 1915 1916 qle1 = model1.qic(result1.params, result1.scale, result1.cov_params()) 1917 qle2 = model2.qic(result2.params, result2.scale, result2.cov_params()) 1918 1919 assert_allclose(ql1, qle1[0], rtol=1e-4) 1920 assert_allclose(ql2, qle2[0], rtol=1e-4) 1921 1922 with warnings.catch_warnings(): 1923 warnings.simplefilter("ignore") 1924 qler1 = result1.qic() 1925 qler2 = result2.qic() 1926 assert_allclose(qler1, qle1[1:], rtol=1e-5) 1927 assert_allclose(qler2, qle2[1:], rtol=1e-5) 1928 1929 1930# Compare differences of QL values computed by numerical integration. 1931# Use difference here so that constants that are inconvenient to compute 1932# cancel out. 1933@pytest.mark.parametrize("family", [families.Gaussian, 1934 families.Binomial, 1935 families.Poisson]) 1936def test_ql_diff(family): 1937 1938 fam = family() 1939 1940 y, x1, x2, g = simple_qic_data(family) 1941 1942 model1 = gee.GEE(y, x1, family=fam, groups=g) 1943 result1 = model1.fit(ddof_scale=0) 1944 mean1 = result1.fittedvalues 1945 1946 model2 = gee.GEE(y, x2, family=fam, groups=g) 1947 result2 = model2.fit(ddof_scale=0) 1948 mean2 = result2.fittedvalues 1949 1950 if family is families.Gaussian: 1951 qldiff = 0 1952 elif family is families.Binomial: 1953 qldiff = np.sum(y * np.log(mean1 / (1 - mean1)) + np.log(1 - mean1)) 1954 qldiff -= np.sum(y * np.log(mean2 / (1 - mean2)) + np.log(1 - mean2)) 1955 elif family is families.Poisson: 1956 qldiff = (np.sum(y * np.log(mean1) - mean1) 1957 - np.sum(y * np.log(mean2) - mean2)) 1958 else: 1959 raise ValueError("unknown family") 1960 1961 qle1, _, _ = model1.qic(result1.params, result1.scale, 1962 result1.cov_params()) 1963 qle2, _, _ = model2.qic(result2.params, result2.scale, 1964 result2.cov_params()) 1965 1966 assert_allclose(qle1 - qle2, qldiff, rtol=1e-5, atol=1e-5) 1967 1968 1969def test_qic_warnings(): 1970 with pytest.warns(UserWarning): 1971 fam = families.Gaussian() 1972 y, x1, _, g = simple_qic_data(fam) 1973 model = gee.GEE(y, x1, family=fam, groups=g) 1974 result = model.fit() 1975 result.qic() 1976 1977 1978@pytest.mark.parametrize("reg", [False, True]) 1979def test_quasipoisson(reg): 1980 1981 np.random.seed(343) 1982 1983 n = 1000 1984 x = np.random.normal(size=(n, 3)) 1985 g = np.random.gamma(1, 1, size=n) 1986 y = np.random.poisson(g) 1987 grp = np.kron(np.arange(100), np.ones(n // 100)) 1988 1989 model1 = gee.GEE(y, x, family=families.Poisson(), groups=grp, 1990 ) 1991 model2 = gee.GEE(y, x, family=families.Poisson(), groups=grp, 1992 ) 1993 1994 if reg: 1995 result1 = model1.fit_regularized(pen_wt=0.1) 1996 result2 = model2.fit_regularized(pen_wt=0.1, scale="X2") 1997 else: 1998 result1 = model1.fit(cov_type="naive") 1999 result2 = model2.fit(scale="X2", cov_type="naive") 2000 2001 # The parameter estimates are the same regardless of how 2002 # the scale parameter is handled 2003 assert_allclose(result1.params, result2.params) 2004 2005 if not reg: 2006 # The robust covariance does not depend on the scale parameter, 2007 # but the naive covariance does. 2008 assert_allclose(result2.cov_naive / result1.cov_naive, 2009 result2.scale * np.ones_like(result2.cov_naive)) 2010 2011 2012def test_grid_ar(): 2013 2014 np.random.seed(243) 2015 2016 r = 0.5 2017 m = 10 2018 ng = 100 2019 ii = np.arange(m) 2020 cov = r**np.abs(np.subtract.outer(ii, ii)) 2021 covr = np.linalg.cholesky(cov) 2022 2023 e = [np.dot(covr, np.random.normal(size=m)) for k in range(ng)] 2024 e = 2 * np.concatenate(e) 2025 2026 grps = [[k]*m for k in range(ng)] 2027 grps = np.concatenate(grps) 2028 2029 x = np.random.normal(size=(ng*m, 3)) 2030 y = np.dot(x, np.r_[1, -1, 0]) + e 2031 2032 model1 = gee.GEE(y, x, groups=grps, 2033 cov_struct=cov_struct.Autoregressive(grid=False)) 2034 result1 = model1.fit() 2035 2036 model2 = gee.GEE(y, x, groups=grps, 2037 cov_struct=cov_struct.Autoregressive(grid=True)) 2038 result2 = model2.fit() 2039 2040 model3 = gee.GEE(y, x, groups=grps, 2041 cov_struct=cov_struct.Stationary(max_lag=1, grid=False)) 2042 result3 = model3.fit() 2043 2044 assert_allclose(result1.cov_struct.dep_params, 2045 result2.cov_struct.dep_params, 2046 rtol=0.05) 2047 assert_allclose(result1.cov_struct.dep_params, 2048 result3.cov_struct.dep_params[1], rtol=0.05) 2049 2050 2051def test_unstructured_complete(): 2052 2053 np.random.seed(43) 2054 ngrp = 400 2055 cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]]) 2056 covr = np.linalg.cholesky(cov) 2057 e = np.random.normal(size=(ngrp, 3)) 2058 e = np.dot(e, covr.T) 2059 xmat = np.random.normal(size=(3*ngrp, 3)) 2060 par = np.r_[1, -2, 0.1] 2061 ey = np.dot(xmat, par) 2062 y = ey + e.ravel() 2063 g = np.kron(np.arange(ngrp), np.ones(3)) 2064 t = np.kron(np.ones(ngrp), np.r_[0, 1, 2]).astype(int) 2065 2066 m = gee.GEE(y, xmat, time=t, cov_struct=cov_struct.Unstructured(), 2067 groups=g) 2068 r = m.fit() 2069 2070 assert_allclose(r.params, par, 0.05, 0.5) 2071 assert_allclose(m.cov_struct.dep_params, cov, 0.05, 0.5) 2072 2073 2074def test_unstructured_incomplete(): 2075 2076 np.random.seed(43) 2077 ngrp = 400 2078 cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]]) 2079 covr = np.linalg.cholesky(cov) 2080 e = np.random.normal(size=(ngrp, 3)) 2081 e = np.dot(e, covr.T) 2082 xmat = np.random.normal(size=(3*ngrp, 3)) 2083 par = np.r_[1, -2, 0.1] 2084 ey = np.dot(xmat, par) 2085 2086 yl, xl, tl, gl = [], [], [], [] 2087 for i in range(ngrp): 2088 2089 # Omit one observation from each group of 3 2090 ix = [0, 1, 2] 2091 ix.pop(i % 3) 2092 ix = np.asarray(ix) 2093 tl.append(ix) 2094 2095 yl.append(ey[3*i + ix] + e[i, ix]) 2096 x = xmat[3*i + ix, :] 2097 xl.append(x) 2098 gl.append(i * np.ones(2)) 2099 2100 y = np.concatenate(yl) 2101 x = np.concatenate(xl, axis=0) 2102 t = np.concatenate(tl) 2103 t = np.asarray(t, dtype=int) 2104 g = np.concatenate(gl) 2105 2106 m = gee.GEE(y, x, time=t[:, None], cov_struct=cov_struct.Unstructured(), 2107 groups=g) 2108 r = m.fit() 2109 2110 assert_allclose(r.params, par, 0.05, 0.5) 2111 assert_allclose(m.cov_struct.dep_params, cov, 0.05, 0.5) 2112 2113 2114def test_ar_covsolve(): 2115 2116 np.random.seed(123) 2117 2118 c = cov_struct.Autoregressive(grid=True) 2119 c.dep_params = 0.4 2120 2121 for d in 1, 2, 4: 2122 for q in 1, 4: 2123 2124 ii = np.arange(d) 2125 mat = 0.4 ** np.abs(np.subtract.outer(ii, ii)) 2126 sd = np.random.uniform(size=d) 2127 2128 if q == 1: 2129 z = np.random.normal(size=d) 2130 else: 2131 z = np.random.normal(size=(d, q)) 2132 2133 sm = np.diag(sd) 2134 z1 = np.linalg.solve(sm, 2135 np.linalg.solve(mat, np.linalg.solve(sm, z))) 2136 z2 = c.covariance_matrix_solve(np.zeros_like(sd), 2137 np.zeros_like(sd), 2138 sd, [z]) 2139 2140 assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5) 2141 2142 2143def test_ex_covsolve(): 2144 2145 np.random.seed(123) 2146 2147 c = cov_struct.Exchangeable() 2148 c.dep_params = 0.4 2149 2150 for d in 1, 2, 4: 2151 for q in 1, 4: 2152 2153 mat = 0.4 * np.ones((d, d)) + 0.6 * np.eye(d) 2154 sd = np.random.uniform(size=d) 2155 2156 if q == 1: 2157 z = np.random.normal(size=d) 2158 else: 2159 z = np.random.normal(size=(d, q)) 2160 2161 sm = np.diag(sd) 2162 z1 = np.linalg.solve(sm, 2163 np.linalg.solve(mat, np.linalg.solve(sm, z))) 2164 z2 = c.covariance_matrix_solve(np.zeros_like(sd), 2165 np.arange(d, dtype=int), 2166 sd, [z]) 2167 2168 assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5) 2169 2170 2171def test_stationary_covsolve(): 2172 2173 np.random.seed(123) 2174 2175 c = cov_struct.Stationary(grid=True) 2176 c.time = np.arange(10, dtype=int) 2177 2178 for d in 1, 2, 4: 2179 for q in 1, 4: 2180 2181 c.dep_params = (2.0 ** (-np.arange(d))) 2182 c.max_lag = d - 1 2183 mat, _ = c.covariance_matrix(np.zeros(d), 2184 np.arange(d, dtype=int)) 2185 sd = np.random.uniform(size=d) 2186 2187 if q == 1: 2188 z = np.random.normal(size=d) 2189 else: 2190 z = np.random.normal(size=(d, q)) 2191 2192 sm = np.diag(sd) 2193 z1 = np.linalg.solve(sm, 2194 np.linalg.solve(mat, np.linalg.solve(sm, z))) 2195 z2 = c.covariance_matrix_solve(np.zeros_like(sd), 2196 np.arange(d, dtype=int), 2197 sd, [z]) 2198 2199 assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5) 2200