1"""
2Test functions for GEE
3
4External comparisons are to R and Stata.  The statsmodels GEE
5implementation should generally agree with the R GEE implementation
6for the independence and exchangeable correlation structures.  For
7other correlation structures, the details of the correlation
8estimation differ among implementations and the results will not agree
9exactly.
10"""
11
12from statsmodels.compat import lrange
13import os
14import numpy as np
15import pytest
16from numpy.testing import (assert_almost_equal, assert_equal, assert_allclose,
17                           assert_array_less, assert_raises, assert_warns,
18                           assert_)
19import statsmodels.genmod.generalized_estimating_equations as gee
20import statsmodels.tools as tools
21import statsmodels.regression.linear_model as lm
22from statsmodels.genmod import families
23from statsmodels.genmod import cov_struct
24import statsmodels.discrete.discrete_model as discrete
25
26import pandas as pd
27from scipy.stats.distributions import norm
28import warnings
29
30try:
31    import matplotlib.pyplot as plt
32except ImportError:
33    pass
34
35pdf_output = False
36
37if pdf_output:
38    from matplotlib.backends.backend_pdf import PdfPages
    pdf = PdfPages("test_gee.pdf")
else:
    pdf = None


def close_or_save(pdf, fig):
    if pdf_output:
        pdf.savefig(fig)


def load_data(fname, icept=True):
    """
    Load a data set from the results directory.  The data set should
    be a CSV file with the following format:

    Column 0: Group indicator
    Column 1: endog variable
    Columns 2-end: exog variables

    If `icept` is True, an intercept is prepended to the exog
    variables.
    """

    cur_dir = os.path.dirname(os.path.abspath(__file__))
    Z = np.genfromtxt(os.path.join(cur_dir, 'results', fname),
                      delimiter=",")

    group = Z[:, 0]
    endog = Z[:, 1]
    exog = Z[:, 2:]

    if icept:
        exog = np.concatenate((np.ones((exog.shape[0], 1)), exog),
                              axis=1)

    return endog, exog, group
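
# A small usage sketch for the helper above (kept as a comment so nothing
# runs at import time); the file name is one of the datasets in the results
# directory used by the tests below:
#
#     endog, exog, group = load_data("gee_logistic_1.csv")
#     assert endog.shape[0] == exog.shape[0] == group.shape[0]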


def check_wrapper(results):
    # check wrapper
    assert_(isinstance(results.params, pd.Series))
    assert_(isinstance(results.fittedvalues, pd.Series))
    assert_(isinstance(results.resid, pd.Series))
    assert_(isinstance(results.centered_resid, pd.Series))

    assert_(isinstance(results._results.params, np.ndarray))
    assert_(isinstance(results._results.fittedvalues, np.ndarray))
    assert_(isinstance(results._results.resid, np.ndarray))
    assert_(isinstance(results._results.centered_resid, np.ndarray))


class TestGEE(object):

    def test_margins_gaussian(self):
        # Check marginal effects for a Gaussian GEE fit.  Marginal
        # effects and ordinary effects should be equal.
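        # With the default Gaussian family and its identity link,
        # dE[y|x]/dx is simply the slope coefficient, so the marginal
        # effects should reproduce params[1:] and bse[1:] exactly.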

        n = 40
        np.random.seed(34234)
        exog = np.random.normal(size=(n, 3))
        exog[:, 0] = 1

        groups = np.kron(np.arange(n / 4), np.r_[1, 1, 1, 1])
        endog = exog[:, 1] + np.random.normal(size=n)

        model = gee.GEE(endog, exog, groups)
        result = model.fit(
            start_params=[-4.88085602e-04, 1.18501903, 4.78820100e-02])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, result.params[1:])
        assert_allclose(marg.margeff_se, result.bse[1:])

        # smoke test
        marg.summary()

    def test_margins_logistic(self):
        # Check marginal effects for a binomial GEE fit.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.GEE(endog, exog, groups, family=families.Binomial())
        result = model.fit(
            cov_type='naive', start_params=[-3.29583687,  2.19722458])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[0.4119796])
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    def test_margins_multinomial(self):
        # Check marginal effects for a 2-class multinomial GEE fit,
        # which should be equivalent to logistic regression.  Comparison
        # comes from Stata.

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive', start_params=[
                           3.295837, -2.197225])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[-0.41197961], rtol=1e-5)
        assert_allclose(marg.margeff_se, np.r_[0.1379962], rtol=1e-6)

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_nominal_plot(self, close_figures):
        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        result = model.fit(cov_type='naive',
                           start_params=[3.295837, -2.197225])

        fig = result.plot_distribution()
        assert_equal(isinstance(fig, plt.Figure), True)

    def test_margins_poisson(self):
        # Check marginal effects for a Poisson GEE fit.

        np.random.seed(34234)
        endog = np.r_[10, 15, 12, 13, 20, 18, 26, 29]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[0, 0, 0, 0, 1, 1, 1, 1]

        groups = np.arange(8)

        model = gee.GEE(endog, exog, groups, family=families.Poisson())
        result = model.fit(cov_type='naive', start_params=[
                           2.52572864, 0.62057649])

        marg = result.get_margeff()

        assert_allclose(marg.margeff, np.r_[11.0928], rtol=1e-6)
        assert_allclose(marg.margeff_se, np.r_[3.269015], rtol=1e-6)

    def test_multinomial(self):
        """
        Check the 2-class multinomial (nominal) GEE fit against
        logistic regression.
        """

        np.random.seed(34234)
        endog = np.r_[0, 0, 0, 0, 1, 1, 1, 1]
        exog = np.ones((8, 2))
        exog[:, 1] = np.r_[1, 2, 1, 1, 2, 1, 2, 2]

        groups = np.arange(8)

        model = gee.NominalGEE(endog, exog, groups)
        results = model.fit(cov_type='naive', start_params=[
                            3.295837, -2.197225])

        logit_model = gee.GEE(endog, exog, groups,
                              family=families.Binomial())
        logit_results = logit_model.fit(cov_type='naive')

        assert_allclose(results.params, -logit_results.params, rtol=1e-5)
        assert_allclose(results.bse, logit_results.bse, rtol=1e-5)

    def test_weighted(self):

        # Simple check where the answer can be computed by hand.
        exog = np.ones(20)
        weights = np.ones(20)
        weights[0:10] = 2
        endog = np.zeros(20)
        endog[0:10] += 1
        groups = np.kron(np.arange(10), np.r_[1, 1])
        model = gee.GEE(endog, exog, groups, weights=weights)
        result = model.fit()
        assert_allclose(result.params, np.r_[2 / 3.])
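        # Hand computation: the weighted mean of endog is
        # (10 * 2 * 1 + 10 * 1 * 0) / (10 * 2 + 10 * 1) = 20 / 30 = 2 / 3.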

        # Comparison against Stata using groups with different sizes.
        weights = np.ones(20)
        weights[10:] = 2
        endog = np.r_[1, 2, 3, 2, 3, 4, 3, 4, 5, 4, 5, 6, 5, 6, 7, 6,
                      7, 8, 7, 8]
        exog1 = np.r_[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
                      3, 3, 3, 3]
        groups = np.r_[1, 1, 2, 2, 2, 2, 4, 4, 5, 5, 6, 6, 6, 6,
                       8, 8, 9, 9, 10, 10]
        exog = np.column_stack((np.ones(20), exog1))

        # Comparison using independence model
        model = gee.GEE(endog, exog, groups, weights=weights,
                        cov_struct=cov_struct.Independence())
        g = np.mean([2, 4, 2, 2, 4, 2, 2, 2])
        fac = 20 / float(20 - g)
        result = model.fit(ddof_scale=0, scaling_factor=fac)

        assert_allclose(result.params, np.r_[1.247573, 1.436893], atol=1e-6)
        assert_allclose(result.scale, 1.808576)

        # Stata multiplies the robust SE by sqrt(N / (N - g)), where N is
        # the total sample size and g is the average group size.
        assert_allclose(result.bse, np.r_[0.895366, 0.3425498], atol=1e-5)
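        # Here the equivalent adjustment enters through scaling_factor=fac
        # above, which scales the covariance by N / (N - g) and hence the
        # standard errors by sqrt(N / (N - g)).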

        # Comparison using exchangeable model
        # Smoke test for now
        model = gee.GEE(endog, exog, groups, weights=weights,
                        cov_struct=cov_struct.Exchangeable())
        model.fit(ddof_scale=0)

    # This is in the release announcement for version 0.6.
    def test_poisson_epil(self):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.join(cur_dir, "results", "epil.csv")
        data = pd.read_csv(fname)

        fam = families.Poisson()
        ind = cov_struct.Independence()
        mod1 = gee.GEE.from_formula("y ~ age + trt + base", data["subject"],
                                    data, cov_struct=ind, family=fam)
        rslt1 = mod1.fit(cov_type='naive')

        # Coefficients should agree with GLM
        from statsmodels.genmod.generalized_linear_model import GLM

        mod2 = GLM.from_formula("y ~ age + trt + base", data,
                                family=families.Poisson())
        rslt2 = mod2.fit()

        # do not use the wrapper; the assert_xxx helpers do not work on it
        rslt1 = rslt1._results
        rslt2 = rslt2._results

        assert_allclose(rslt1.params, rslt2.params, rtol=1e-6, atol=1e-6)
        assert_allclose(rslt1.bse, rslt2.bse, rtol=1e-6, atol=1e-6)

    def test_missing(self):
        # Test missing data handling when calling from the api with arrays.
        # Formula-based missing data handling is tested separately below.

        np.random.seed(34234)
        endog = np.random.normal(size=100)
        exog = np.random.normal(size=(100, 3))
        exog[:, 0] = 1
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog[10:12, 1] = np.nan

        mod1 = gee.GEE(endog, exog, groups, missing='drop')
        rslt1 = mod1.fit()

        assert_almost_equal(len(mod1.endog), 95)
        assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 3])

        ii = np.isfinite(endog) & np.isfinite(exog).all(1)

        mod2 = gee.GEE(endog[ii], exog[ii, :], groups[ii], missing='none')
        rslt2 = mod2.fit()

        assert_almost_equal(rslt1.params, rslt2.params)
        assert_almost_equal(rslt1.bse, rslt2.bse)

    def test_missing_formula(self):
        # Test missing data handling for formulas.

        np.random.seed(34234)
        endog = np.random.normal(size=100)
        exog1 = np.random.normal(size=100)
        exog2 = np.random.normal(size=100)
        exog3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        endog[0] = np.nan
        endog[5:7] = np.nan
        exog2[10:12] = np.nan

        data0 = pd.DataFrame({"endog": endog, "exog1": exog1, "exog2": exog2,
                              "exog3": exog3, "groups": groups})

        for k in 0, 1:
            data = data0.copy()
            kwargs = {}
            if k == 1:
                data["offset"] = 0
                data["time"] = 0
                kwargs["offset"] = "offset"
                kwargs["time"] = "time"

            mod1 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                        groups="groups", data=data,
                                        missing='drop', **kwargs)
            rslt1 = mod1.fit()

            assert_almost_equal(len(mod1.endog), 95)
            assert_almost_equal(np.asarray(mod1.exog.shape), np.r_[95, 4])

            data = data.dropna()

            kwargs = {}
            if k == 1:
                kwargs["offset"] = data["offset"]
                kwargs["time"] = data["time"]

            mod2 = gee.GEE.from_formula("endog ~ exog1 + exog2 + exog3",
                                        groups=data["groups"], data=data,
                                        missing='none', **kwargs)
            rslt2 = mod2.fit()

            assert_almost_equal(rslt1.params.values, rslt2.params.values)
            assert_almost_equal(rslt1.bse.values, rslt2.bse.values)

    @pytest.mark.parametrize("k1", [False, True])
    @pytest.mark.parametrize("k2", [False, True])
    def test_invalid_args(self, k1, k2):

        for j in range(3):

            p = [20, 20, 20]
            p[j] = 18

            endog = np.zeros(p[0])
            exog = np.zeros((p[1], 2))

            kwargs = {}
            kwargs["groups"] = np.zeros(p[2])
            if k1:
                kwargs["exposure"] = np.zeros(18)
            if k2:
                kwargs["time"] = np.zeros(18)
            with assert_raises(ValueError):
                gee.GEE(endog, exog, **kwargs)

    def test_default_time(self):
        # Check that the time defaults work correctly.

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = families.Binomial()
        va = cov_struct.Autoregressive(grid=False)

        md1 = gee.GEE(endog, exog, group, family=family, cov_struct=va)
        mdf1 = md1.fit()

        md2 = gee.GEE(endog, exog, group, time=T, family=family,
                      cov_struct=va)
        mdf2 = md2.fit()

        assert_almost_equal(mdf1.params, mdf2.params, decimal=6)
        assert_almost_equal(mdf1.standard_errors(),
                            mdf2.standard_errors(), decimal=6)

    def test_logistic(self):
        # R code for comparing results:

        # library(gee)
        # Z = read.csv("results/gee_logistic_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]

        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="independence")
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="exchangeable")
        # sme = summary(me)
        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # ma = gee(Y ~ X1 + X2 + X3, id=Id, family=binomial,
        #         corstr="AR-M")
        # sma = summary(ma)
        # u = coefficients(sma)
        # cfa = paste(u[,1], collapse=",")
        # sea = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s],[%s]]", cfi, cfe, cfa)
        # sprintf("se = [[%s],[%s],[%s]]", sei, see, sea)

        endog, exog, group = load_data("gee_logistic_1.csv")

        # Time values for the autoregressive model
        T = np.zeros(len(endog))
        idx = set(group)
        for ii in idx:
            jj = np.flatnonzero(group == ii)
            T[jj] = lrange(len(jj))

        family = families.Binomial()
        ve = cov_struct.Exchangeable()
        vi = cov_struct.Independence()
        va = cov_struct.Autoregressive(grid=False)

        # From R gee
        cf = [[0.0167272965285882, 1.13038654425893,
               -1.86896345082962, 1.09397608331333],
              [0.0178982283915449, 1.13118798191788,
               -1.86133518416017, 1.08944256230299],
              [0.0109621937947958, 1.13226505028438,
               -1.88278757333046, 1.09954623769449]]
        se = [[0.127291720283049, 0.166725808326067,
               0.192430061340865, 0.173141068839597],
              [0.127045031730155, 0.165470678232842,
               0.192052750030501, 0.173174779369249],
              [0.127240302296444, 0.170554083928117,
               0.191045527104503, 0.169776150974586]]

        for j, v in enumerate((vi, ve, va)):
            md = gee.GEE(endog, exog, group, T, family, v)
            mdf = md.fit()
            if id(v) != id(va):
                assert_almost_equal(mdf.params, cf[j], decimal=6)
                assert_almost_equal(mdf.standard_errors(), se[j],
                                    decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                      family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=6)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # FIXME: do not leave commented-out
        # Check for run-time exceptions in summary
        # print(mdf.summary())

    def test_autoregressive(self):

        dep_params_true = [0, 0.589208623896, 0.559823804948]

        params_true = [[1.08043787, 1.12709319, 0.90133927],
                       [0.9613677, 1.05826987, 0.90832055],
                       [1.05370439, 0.96084864, 0.93923374]]

        np.random.seed(342837482)

        num_group = 100
        ar_param = 0.5
        k = 3

        ga = families.Gaussian()

        for gsize in 1, 2, 3:

            ix = np.arange(gsize)[:, None] - np.arange(gsize)[None, :]
            ix = np.abs(ix)
            cmat = ar_param ** ix
            cmat_r = np.linalg.cholesky(cmat)

            endog = []
            exog = []
            groups = []
            for i in range(num_group):
                x = np.random.normal(size=(gsize, k))
                exog.append(x)
                expval = x.sum(1)
                errors = np.dot(cmat_r, np.random.normal(size=gsize))
                endog.append(expval + errors)
                groups.append(i * np.ones(gsize))

            endog = np.concatenate(endog)
            groups = np.concatenate(groups)
            exog = np.concatenate(exog, axis=0)

            ar = cov_struct.Autoregressive(grid=False)
            md = gee.GEE(endog, exog, groups, family=ga, cov_struct=ar)
            mdf = md.fit()
            assert_almost_equal(ar.dep_params, dep_params_true[gsize - 1])
            assert_almost_equal(mdf.params, params_true[gsize - 1])

    def test_post_estimation(self):

        family = families.Gaussian()
        endog, exog, group = load_data("gee_linear_1.csv")

        ve = cov_struct.Exchangeable()

        md = gee.GEE(endog, exog, group, None, family, ve)
        mdf = md.fit()

        assert_almost_equal(np.dot(exog, mdf.params),
                            mdf.fittedvalues)
        assert_almost_equal(endog - np.dot(exog, mdf.params),
                            mdf.resid)

    def test_scoretest(self):
        # Regression tests

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))

        # Test under the null.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        mod1 = gee.GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
        res1 = mod1.fit()
        assert_almost_equal(res1.score_test()["statistic"],
                            1.08126334)
        assert_almost_equal(res1.score_test()["p-value"],
                            0.2984151086)

        # Test under the alternative.
        L = np.array([[1., -1, 0, 0]])
        R = np.array([1.0, ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        mod2 = gee.GEE(endog, exog, group, family=family,
                       cov_struct=va, constraint=(L, R))
        res2 = mod2.fit()
        assert_almost_equal(res2.score_test()["statistic"],
                            3.491110965)
        assert_almost_equal(res2.score_test()["p-value"],
                            0.0616991659)

        # Compare to Wald tests
        exog = np.random.normal(size=(n, 2))
        L = np.array([[1, -1]])
        R = np.array([0.])
        f = np.r_[1, -1]
        for i in range(10):
            endog = exog[:, 0] + (0.5 + i / 10.) * exog[:, 1] +\
                np.random.normal(size=n)
            family = families.Gaussian()
            va = cov_struct.Independence()
            mod0 = gee.GEE(endog, exog, group, family=family,
                           cov_struct=va)
            rslt0 = mod0.fit()
            family = families.Gaussian()
            va = cov_struct.Independence()
            mod1 = gee.GEE(endog, exog, group, family=family,
                           cov_struct=va, constraint=(L, R))
            res1 = mod1.fit()
            se = np.sqrt(np.dot(f, np.dot(rslt0.cov_params(), f)))
            wald_z = np.dot(f, rslt0.params) / se
            wald_p = 2 * norm.cdf(-np.abs(wald_z))
            score_p = res1.score_test()["p-value"]
            assert_array_less(np.abs(wald_p - score_p), 0.02)

    @pytest.mark.parametrize("cov_struct", [cov_struct.Independence,
                                            cov_struct.Exchangeable])
    def test_compare_score_test(self, cov_struct):

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        group = np.kron(np.arange(n / 4), np.ones(4))

        exog_sub = exog[:, [0, 3]]
        endog = exog_sub.sum(1) + 3 * np.random.normal(size=n)

        L = np.asarray([[0, 1, 0, 0], [0, 0, 1, 0]])
        R = np.zeros(2)
        mod_lr = gee.GEE(endog, exog, group, constraint=(L, R),
                         cov_struct=cov_struct())
        mod_lr.fit()

        mod_sub = gee.GEE(endog, exog_sub, group, cov_struct=cov_struct())
        res_sub = mod_sub.fit()

        for call_fit in [False, True]:
            mod = gee.GEE(endog, exog, group, cov_struct=cov_struct())
            if call_fit:
                # Should work with or without fitting the parent model
                mod.fit()
            score_results = mod.compare_score_test(res_sub)
            assert_almost_equal(
                score_results["statistic"],
                mod_lr.score_test_results["statistic"])
            assert_almost_equal(
                score_results["p-value"],
                mod_lr.score_test_results["p-value"])
            assert_almost_equal(
                score_results["df"],
                mod_lr.score_test_results["df"])

    def test_compare_score_test_warnings(self):

        np.random.seed(6432)
        n = 200  # Must be divisible by 4
        exog = np.random.normal(size=(n, 4))
        group = np.kron(np.arange(n / 4), np.ones(4))
        exog_sub = exog[:, [0, 3]]
        endog = exog_sub.sum(1) + 3 * np.random.normal(size=n)

        # Mismatched cov_struct
        with assert_warns(UserWarning):
            mod_sub = gee.GEE(endog, exog_sub, group,
                              cov_struct=cov_struct.Exchangeable())
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group,
                          cov_struct=cov_struct.Independence())
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched family
        with assert_warns(UserWarning):
            mod_sub = gee.GEE(endog, exog_sub, group,
                              family=families.Gaussian())
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group, family=families.Poisson())
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched size
        with assert_raises(Exception):
            mod_sub = gee.GEE(endog, exog_sub, group)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog[0:100], exog[:100, :], group[0:100])
            mod.compare_score_test(res_sub)  # smoketest

        # Mismatched weights
        with assert_warns(UserWarning):
            w = np.random.uniform(size=n)
            mod_sub = gee.GEE(endog, exog_sub, group, weights=w)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group)
            mod.compare_score_test(res_sub)  # smoketest

        # Parent and submodel are the same dimension
        with pytest.warns(UserWarning):
            w = np.random.uniform(size=n)
            mod_sub = gee.GEE(endog, exog, group)
            res_sub = mod_sub.fit()
            mod = gee.GEE(endog, exog, group)
            mod.compare_score_test(res_sub)  # smoketest

    def test_constraint_covtype(self):
        # Test constraints with different cov types
        np.random.seed(6432)
        n = 200
        exog = np.random.normal(size=(n, 4))
        endog = exog[:, 0] + exog[:, 1] + exog[:, 2]
        endog += 3 * np.random.normal(size=n)
        group = np.kron(np.arange(n / 4), np.ones(4))
        L = np.array([[1., -1, 0, 0]])
        R = np.array([0., ])
        family = families.Gaussian()
        va = cov_struct.Independence()
        for cov_type in "robust", "naive", "bias_reduced":
            model = gee.GEE(endog, exog, group, family=family,
                            cov_struct=va, constraint=(L, R))
            result = model.fit(cov_type=cov_type)
            result.standard_errors(cov_type=cov_type)
            assert_allclose(result.cov_robust.shape, np.r_[4, 4])
            assert_allclose(result.cov_naive.shape, np.r_[4, 4])
            if cov_type == "bias_reduced":
                assert_allclose(result.cov_robust_bc.shape, np.r_[4, 4])

    def test_linear(self):
        # library(gee)

        # Z = read.csv("results/gee_linear_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # mi = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #         corstr="independence", tol=1e-8, maxit=100)
        # smi = summary(mi)
        # u = coefficients(smi)

        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3, id=Id, family=gaussian,
        #         corstr="exchangeable", tol=1e-8, maxit=100)
        # sme = summary(me)
        # u = coefficients(sme)

        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = families.Gaussian()

        endog, exog, group = load_data("gee_linear_1.csv")

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        # From R gee
        cf = [[-0.01850226507491, 0.81436304278962,
               -1.56167635393184, 0.794239361055003],
              [-0.0182920577154767, 0.814898414022467,
               -1.56194040106201, 0.793499517527478]]
        se = [[0.0440733554189401, 0.0479993639119261,
               0.0496045952071308, 0.0479467597161284],
              [0.0440369906460754, 0.0480069787567662,
               0.049519758758187, 0.0479760443027526]]

        for j, v in enumerate((vi, ve)):
            md = gee.GEE(endog, exog, group, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

        # Test with formulas
        D = np.concatenate((endog[:, None], group[:, None], exog[:, 1:]),
                           axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", "Id", D,
                                      family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=10)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=10)

    def test_linear_constrained(self):

        family = families.Gaussian()

        np.random.seed(34234)
        exog = np.random.normal(size=(300, 4))
        exog[:, 0] = 1
        endog = np.dot(exog, np.r_[1, 1, 0, 0.2]) +\
            np.random.normal(size=300)
        group = np.kron(np.arange(100), np.r_[1, 1, 1])

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        L = np.r_[[[0, 0, 0, 1]]]
        R = np.r_[0, ]

        for j, v in enumerate((vi, ve)):
            md = gee.GEE(endog, exog, group, None, family, v,
                         constraint=(L, R))
            mdf = md.fit()
            assert_almost_equal(mdf.params[3], 0, decimal=10)

    def test_nested_linear(self):

        family = families.Gaussian()

        endog, exog, group = load_data("gee_nested_linear_1.csv")

        group_n = []
        for i in range(endog.shape[0] // 10):
            group_n.extend([0, ] * 5)
            group_n.extend([1, ] * 5)
        group_n = np.array(group_n)[:, None]

        dp = cov_struct.Independence()
        md = gee.GEE(endog, exog, group, None, family, dp)
        mdf1 = md.fit()

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.1671073,  1.00467426, -2.01723004,  0.97297106]
        se = np.r_[0.08629606,  0.04058653,  0.04067038,  0.03777989]
        assert_almost_equal(mdf1.params, cf, decimal=6)
        assert_almost_equal(mdf1.standard_errors(), se,
                            decimal=6)

        ne = cov_struct.Nested()
        md = gee.GEE(endog, exog, group, None, family, ne,
                     dep_data=group_n)
        mdf2 = md.fit(start_params=mdf1.params)

        # From statsmodels.GEE (not an independent test)
        cf = np.r_[-0.16655319,  1.02183688, -2.00858719,  1.00101969]
        se = np.r_[0.08632616,  0.02913582,  0.03114428,  0.02893991]
        assert_almost_equal(mdf2.params, cf, decimal=6)
        assert_almost_equal(mdf2.standard_errors(), se,
                            decimal=6)

        smry = mdf2.cov_struct.summary()
        assert_allclose(
            smry.Variance,
            np.r_[1.043878, 0.611656, 1.421205],
            atol=1e-5, rtol=1e-5)

    def test_nested_pandas(self):

        np.random.seed(4234)
        n = 10000

        # Outer groups
        groups = np.kron(np.arange(n // 100), np.ones(100)).astype(int)

        # Inner groups
        groups1 = np.kron(np.arange(n // 50), np.ones(50)).astype(int)
        groups2 = np.kron(np.arange(n // 10), np.ones(10)).astype(int)

        # Group effects
        groups_e = np.random.normal(size=n // 100)
        groups1_e = 2 * np.random.normal(size=n // 50)
        groups2_e = 3 * np.random.normal(size=n // 10)

        y = groups_e[groups] + groups1_e[groups1] + groups2_e[groups2]
        y += 0.5 * np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "TheGroups": groups,
                           "groups1": groups1, "groups2": groups2})

        model = gee.GEE.from_formula("y ~ 1", groups="TheGroups",
                                     dep_data="0 + groups1 + groups2",
                                     cov_struct=cov_struct.Nested(),
                                     data=df)
        result = model.fit()

        # The true variances are 1, 4, 9, 0.25
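        # (the three group effects have sd 1, 2 and 3 and the residual has
        # sd 0.5; squaring gives the variances above)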
        smry = result.cov_struct.summary()
        assert_allclose(
            smry.Variance,
            np.r_[1.437299, 4.421543, 8.905295, 0.258480],
            atol=1e-5, rtol=1e-5)

    def test_ordinal(self):

        family = families.Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = cov_struct.GlobalOddsRatio("ordinal")

        mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        # Regression test
        cf = np.r_[1.09250002, 0.0217443, -0.39851092, -0.01812116,
                   0.03023969, 1.18258516, 0.01803453, -1.10203381]
        assert_almost_equal(rslt.params, cf, decimal=5)

        # Regression test
        se = np.r_[0.10883461, 0.10330197, 0.11177088, 0.05486569,
                   0.05997153, 0.09168148, 0.05953324, 0.0853862]
        assert_almost_equal(rslt.bse, se, decimal=5)

        # Check that we get the correct results type
        assert_equal(type(rslt), gee.OrdinalGEEResultsWrapper)
        assert_equal(type(rslt._results), gee.OrdinalGEEResults)

    @pytest.mark.smoke
    def test_ordinal_formula(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.arange(n)
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)

        df = pd.DataFrame({"y": y, "groups": groups, "x1": x1, "x2": x2})

        model = gee.OrdinalGEE.from_formula("y ~ 0 + x1 + x2", groups, data=df)
        model.fit()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            model = gee.NominalGEE.from_formula("y ~ 0 + x1 + x2", groups,
                                                data=df)
            model.fit()

    @pytest.mark.smoke
    def test_ordinal_independence(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        odi = cov_struct.OrdinalIndependence()
        model1 = gee.OrdinalGEE(y, x, groups, cov_struct=odi)
        model1.fit()

    @pytest.mark.smoke
    def test_nominal_independence(self):

        np.random.seed(434)
        n = 40
        y = np.random.randint(0, 3, n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        x = np.random.normal(size=(n, 1))

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            nmi = cov_struct.NominalIndependence()
            model1 = gee.NominalGEE(y, x, groups, cov_struct=nmi)
            model1.fit()

    @pytest.mark.smoke
    @pytest.mark.matplotlib
    def test_ordinal_plot(self, close_figures):
        family = families.Binomial()

        endog, exog, groups = load_data("gee_ordinal_1.csv",
                                        icept=False)

        va = cov_struct.GlobalOddsRatio("ordinal")

        mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
        rslt = mod.fit()

        fig = rslt.plot_distribution()
        assert_equal(isinstance(fig, plt.Figure), True)

    def test_nominal(self):

        endog, exog, groups = load_data("gee_nominal_1.csv",
                                        icept=False)

        # Test with independence correlation
        va = cov_struct.Independence()
        mod1 = gee.NominalGEE(endog, exog, groups, cov_struct=va)
        rslt1 = mod1.fit()

        # Regression test
        cf1 = np.r_[0.450009, 0.451959, -0.918825, -0.468266]
        se1 = np.r_[0.08915936, 0.07005046, 0.12198139, 0.08281258]
        assert_allclose(rslt1.params, cf1, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt1.standard_errors(), se1, rtol=1e-5, atol=1e-5)

        # Test with global odds ratio dependence
        va = cov_struct.GlobalOddsRatio("nominal")
        mod2 = gee.NominalGEE(endog, exog, groups, cov_struct=va)
        rslt2 = mod2.fit(start_params=rslt1.params)

        # Regression test
        cf2 = np.r_[0.455365, 0.415334, -0.916589, -0.502116]
        se2 = np.r_[0.08803614, 0.06628179, 0.12259726, 0.08411064]
        assert_allclose(rslt2.params, cf2, rtol=1e-5, atol=1e-5)
        assert_allclose(rslt2.standard_errors(), se2, rtol=1e-5, atol=1e-5)

        # Make sure we get the correct results type
        assert_equal(type(rslt1), gee.NominalGEEResultsWrapper)
        assert_equal(type(rslt1._results), gee.NominalGEEResults)

    def test_poisson(self):
        # library(gee)
        # Z = read.csv("results/gee_poisson_1.csv", header=FALSE)
        # Y = Z[,2]
        # Id = Z[,1]
        # X1 = Z[,3]
        # X2 = Z[,4]
        # X3 = Z[,5]
        # X4 = Z[,6]
        # X5 = Z[,7]

        # mi = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #        corstr="independence", scale.fix=TRUE)
        # smi = summary(mi)
        # u = coefficients(smi)
        # cfi = paste(u[,1], collapse=",")
        # sei = paste(u[,4], collapse=",")

        # me = gee(Y ~ X1 + X2 + X3 + X4 + X5, id=Id, family=poisson,
        #        corstr="exchangeable", scale.fix=TRUE)
        # sme = summary(me)

        # u = coefficients(sme)
        # cfe = paste(u[,1], collapse=",")
        # see = paste(u[,4], collapse=",")

        # sprintf("cf = [[%s],[%s]]", cfi, cfe)
        # sprintf("se = [[%s],[%s]]", sei, see)

        family = families.Poisson()

        endog, exog, group_n = load_data("gee_poisson_1.csv")

        vi = cov_struct.Independence()
        ve = cov_struct.Exchangeable()

        # From R gee
        cf = [[-0.0364450410793481, -0.0543209391301178,
               0.0156642711741052, 0.57628591338724,
               -0.00465659951186211, -0.477093153099256],
              [-0.0315615554826533, -0.0562589480840004,
               0.0178419412298561, 0.571512795340481,
               -0.00363255566297332, -0.475971696727736]]
        se = [[0.0611309237214186, 0.0390680524493108,
               0.0334234174505518, 0.0366860768962715,
               0.0304758505008105, 0.0316348058881079],
              [0.0610840153582275, 0.0376887268649102,
               0.0325168379415177, 0.0369786751362213,
               0.0296141014225009, 0.0306115470200955]]

        for j, v in enumerate((vi, ve)):
            md = gee.GEE(endog, exog, group_n, None, family, v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)

        # Test with formulas
        D = np.concatenate((endog[:, None], group_n[:, None],
                            exog[:, 1:]), axis=1)
        D = pd.DataFrame(D)
        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
                                     for k in range(exog.shape[1] - 1)]
        for j, v in enumerate((vi, ve)):
            md = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
                                      D, family=family, cov_struct=v)
            mdf = md.fit()
            assert_almost_equal(mdf.params, cf[j], decimal=5)
            assert_almost_equal(mdf.standard_errors(), se[j],
                                decimal=6)
            # print(mdf.params)

    def test_groups(self):
        # Test various group structures (nonconsecutive, different
        # group sizes, not ordered, string labels)

        np.random.seed(234)
        n = 40
        x = np.random.normal(size=(n, 2))
        y = np.random.normal(size=n)

        # groups with unequal group sizes
        groups = np.kron(np.arange(n / 4), np.ones(4))
        groups[8:12] = 3
        groups[34:36] = 9

        model1 = gee.GEE(y, x, groups=groups)
        result1 = model1.fit()

        # Unordered groups
        ix = np.random.permutation(n)
        y1 = y[ix]
        x1 = x[ix, :]
        groups1 = groups[ix]

        model2 = gee.GEE(y1, x1, groups=groups1)
        result2 = model2.fit()

        assert_allclose(result1.params, result2.params)
        assert_allclose(result1.tvalues, result2.tvalues)

        # group labels are strings
        mp = {}
        import string
        for j, g in enumerate(set(groups)):
            mp[g] = string.ascii_letters[j:j + 4]
        groups2 = [mp[g] for g in groups]

        model3 = gee.GEE(y, x, groups=groups2)
        result3 = model3.fit()

        assert_allclose(result1.params, result3.params)
        assert_allclose(result1.tvalues, result3.tvalues)

    def test_compare_OLS(self):
        # Gaussian GEE with independence correlation should agree
        # exactly with OLS for parameter estimates and standard errors
        # derived from the naive covariance estimate.
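        # With an independence working correlation and the Gaussian family,
        # the GEE estimating equations reduce to the OLS normal equations,
        # so the point estimates and the naive standard errors should match
        # OLS to high precision.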

        vs = cov_struct.Independence()
        family = families.Gaussian()

        np.random.seed(34234)
        Y = np.random.normal(size=100)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.kron(lrange(20), np.ones(5))

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        md = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                  family=family, cov_struct=vs)
        mdf = md.fit()

        ols = lm.OLS.from_formula("Y ~ X1 + X2 + X3", data=D).fit()

        # do not use the wrapper; the assert_xxx helpers do not work on it
        ols = ols._results

        assert_almost_equal(ols.params, mdf.params, decimal=10)

        se = mdf.standard_errors(cov_type="naive")
        assert_almost_equal(ols.bse, se, decimal=10)

        naive_tvalues = mdf.params / np.sqrt(np.diag(mdf.cov_naive))
        assert_almost_equal(naive_tvalues, ols.tvalues, decimal=10)

    def test_formulas(self):
        # Check formulas, especially passing groups and time as either
        # variable names or arrays.

        n = 100
        np.random.seed(34234)
        Y = np.random.normal(size=n)
        X1 = np.random.normal(size=n)
        mat = np.concatenate((np.ones((n, 1)), X1[:, None]), axis=1)
        Time = np.random.uniform(size=n)
        groups = np.kron(lrange(20), np.ones(5))

        data = pd.DataFrame({"Y": Y, "X1": X1, "Time": Time, "groups": groups})

        va = cov_struct.Autoregressive(grid=False)
        family = families.Gaussian()

        mod1 = gee.GEE(Y, mat, groups, time=Time, family=family,
                       cov_struct=va)
        rslt1 = mod1.fit()

        mod2 = gee.GEE.from_formula("Y ~ X1", groups, data, time=Time,
                                    family=family, cov_struct=va)
        rslt2 = mod2.fit()

        mod3 = gee.GEE.from_formula("Y ~ X1", groups, data, time="Time",
                                    family=family, cov_struct=va)
        rslt3 = mod3.fit()

        mod4 = gee.GEE.from_formula("Y ~ X1", "groups", data, time=Time,
                                    family=family, cov_struct=va)
        rslt4 = mod4.fit()

        mod5 = gee.GEE.from_formula("Y ~ X1", "groups", data, time="Time",
                                    family=family, cov_struct=va)
        rslt5 = mod5.fit()

        assert_almost_equal(rslt1.params, rslt2.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt3.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt4.params, decimal=8)
        assert_almost_equal(rslt1.params, rslt5.params, decimal=8)

        check_wrapper(rslt2)

    def test_compare_logit(self):

        vs = cov_struct.Independence()
        family = families.Binomial()

        np.random.seed(34234)
        Y = 1 * (np.random.normal(size=100) < 0)
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                    family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = discrete.Logit.from_formula("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_compare_poisson(self):

        vs = cov_struct.Independence()
        family = families.Poisson()

        np.random.seed(34234)
        Y = np.ceil(-np.log(np.random.uniform(size=100)))
        X1 = np.random.normal(size=100)
        X2 = np.random.normal(size=100)
        X3 = np.random.normal(size=100)
        groups = np.random.randint(0, 4, size=100)

        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "X3": X3})

        mod1 = gee.GEE.from_formula("Y ~ X1 + X2 + X3", groups, D,
                                    family=family, cov_struct=vs)
        rslt1 = mod1.fit()

        mod2 = discrete.Poisson.from_formula("Y ~ X1 + X2 + X3", data=D)
        rslt2 = mod2.fit(disp=False)

        assert_almost_equal(rslt1.params.values, rslt2.params.values,
                            decimal=10)

    def test_predict(self):

        n = 50
        np.random.seed(4324)
        X1 = np.random.normal(size=n)
        X2 = np.random.normal(size=n)
        groups = np.kron(np.arange(n / 2), np.r_[1, 1])
        offset = np.random.uniform(1, 2, size=n)
        Y = np.random.normal(0.1 * (X1 + X2) + offset, size=n)
        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
                             "offset": offset})

        fml = "Y ~ X1 + X2"
        model = gee.GEE.from_formula(fml, groups, data,
                                     family=families.Gaussian(),
                                     offset="offset")
        result = model.fit(start_params=[0, 0.1, 0.1])
        assert_equal(result.converged, True)

        pred1 = result.predict()
        pred2 = result.predict(offset=data.offset)
        pred3 = result.predict(exog=data[["X1", "X2"]], offset=data.offset)
        pred4 = result.predict(exog=data[["X1", "X2"]], offset=0 * data.offset)
        pred5 = result.predict(offset=0 * data.offset)

        assert_allclose(pred1, pred2)
        assert_allclose(pred1, pred3)
        assert_allclose(pred1, pred4 + data.offset)
        assert_allclose(pred1, pred5 + data.offset)
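        # With the Gaussian family's identity link the offset is additive
        # on the scale of the fitted values, so predictions computed with a
        # zero offset recover the default predictions once the offset is
        # added back, as checked above.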

        x1_new = np.random.normal(size=10)
        x2_new = np.random.normal(size=10)
        new_exog = pd.DataFrame({"X1": x1_new, "X2": x2_new})
        pred6 = result.predict(exog=new_exog)
        params = result.params
        pred6_correct = params[0] + params[1] * x1_new + params[2] * x2_new
        assert_allclose(pred6, pred6_correct)

    def test_stationary_grid(self):

        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
        exog = tools.add_constant(exog)

        cs = cov_struct.Stationary(max_lag=2, grid=True)
        model = gee.GEE(endog, exog, group, cov_struct=cs)
        result = model.fit()
        se = result.bse * np.sqrt(12 / 9.)  # Stata adjustment

        assert_allclose(cs.covariance_matrix(np.r_[1, 1, 1], 0)[0].sum(),
                        6.4633538285149452)

        # Obtained from Stata using:
        # xtgee y x, i(g) vce(robust) corr(Stationary2)
        assert_allclose(result.params, np.r_[
                        4.463968, -0.0386674], rtol=1e-5, atol=1e-5)
        assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5)

    def test_stationary_nogrid(self):

        # First test special case where the data follow a grid but we
        # fit using nogrid
        endog = np.r_[4, 2, 3, 1, 4, 5, 6, 7, 8, 3, 2, 4.]
        exog = np.r_[2, 3, 1, 4, 3, 2, 5, 4, 5, 6, 3, 2]
        time = np.r_[0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
        group = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]

        exog = tools.add_constant(exog)

        model = gee.GEE(endog, exog, group,
                        cov_struct=cov_struct.Stationary(max_lag=2,
                                                         grid=False))
        result = model.fit()
        se = result.bse * np.sqrt(12 / 9.)  # Stata adjustment

        # Obtained from Stata using:
        # xtgee y x, i(g) vce(robust) corr(Stationary2)
        assert_allclose(result.params, np.r_[
                        4.463968, -0.0386674], rtol=1e-5, atol=1e-5)
        assert_allclose(se, np.r_[0.5217202, 0.2800333], rtol=1e-5, atol=1e-5)

        # Smoke test for no grid  # TODO: pytest.mark.smoke
1339        time = np.r_[0, 1, 3, 0, 2, 3, 0, 2, 3, 0, 1, 2][:, None]
1340        model = gee.GEE(endog, exog, group, time=time,
1341                        cov_struct=cov_struct.Stationary(max_lag=4,
1342                                                         grid=False))
1343        model.fit()
1344
1345    def test_predict_exposure(self):
1346
1347        n = 50
1348        np.random.seed(34234)
1349        X1 = np.random.normal(size=n)
1350        X2 = np.random.normal(size=n)
1351        groups = np.kron(np.arange(25), np.r_[1, 1])
1352        offset = np.random.uniform(1, 2, size=n)
1353        exposure = np.random.uniform(1, 2, size=n)
1354        Y = np.random.poisson(0.1 * (X1 + X2) + offset +
1355                              np.log(exposure), size=n)
1356        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
1357                             "offset": offset, "exposure": exposure})
1358
1359        fml = "Y ~ X1 + X2"
1360        model = gee.GEE.from_formula(fml, groups, data,
1361                                     family=families.Poisson(),
1362                                     offset="offset", exposure="exposure")
1363        result = model.fit()
1364        assert_equal(result.converged, True)
1365
1366        pred1 = result.predict()
1367        pred2 = result.predict(offset=data["offset"])
1368        pred3 = result.predict(exposure=data["exposure"])
1369        pred4 = result.predict(
1370            offset=data["offset"], exposure=data["exposure"])
1371        pred5 = result.predict(exog=data[-10:],
1372                               offset=data["offset"][-10:],
1373                               exposure=data["exposure"][-10:])
1374        # without patsy
1375        pred6 = result.predict(exog=result.model.exog[-10:],
1376                               offset=data["offset"][-10:],
1377                               exposure=data["exposure"][-10:],
1378                               transform=False)
1379        assert_allclose(pred1, pred2)
1380        assert_allclose(pred1, pred3)
1381        assert_allclose(pred1, pred4)
1382        assert_allclose(pred1[-10:], pred5)
1383        assert_allclose(pred1[-10:], pred6)
1384
1385    def test_offset_formula(self):
1386        # Test various ways of passing offset and exposure to `from_formula`.
1387
1388        n = 50
1389        np.random.seed(34234)
1390        X1 = np.random.normal(size=n)
1391        X2 = np.random.normal(size=n)
1392        groups = np.kron(np.arange(25), np.r_[1, 1])
1393        offset = np.random.uniform(1, 2, size=n)
1394        exposure = np.exp(offset)
1395        Y = np.random.poisson(0.1 * (X1 + X2) + 2 * offset, size=n)
1396        data = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2, "groups": groups,
1397                             "offset": offset, "exposure": exposure})
1398
1399        fml = "Y ~ X1 + X2"
1400        model1 = gee.GEE.from_formula(fml, groups, data,
1401                                      family=families.Poisson(),
1402                                      offset="offset")
1403        result1 = model1.fit()
1404        assert_equal(result1.converged, True)
1405
1406        model2 = gee.GEE.from_formula(fml, groups, data,
1407                                      family=families.Poisson(),
1408                                      offset=offset)
1409        result2 = model2.fit(start_params=result1.params)
1410        assert_allclose(result1.params, result2.params)
1411        assert_equal(result2.converged, True)
1412
1413        model3 = gee.GEE.from_formula(fml, groups, data,
1414                                      family=families.Poisson(),
1415                                      exposure=exposure)
1416        result3 = model3.fit(start_params=result1.params)
1417        assert_allclose(result1.params, result3.params)
1418        assert_equal(result3.converged, True)
1419
1420        model4 = gee.GEE.from_formula(fml, groups, data,
1421                                      family=families.Poisson(),
1422                                      exposure="exposure")
1423        result4 = model4.fit(start_params=result1.params)
1424        assert_allclose(result1.params, result4.params)
1425        assert_equal(result4.converged, True)
1426
1427        model5 = gee.GEE.from_formula(fml, groups, data,
1428                                      family=families.Poisson(),
1429                                      exposure="exposure", offset="offset")
1430        result5 = model5.fit()
1431        assert_equal(result5.converged, True)
1432
1433        model6 = gee.GEE.from_formula(fml, groups, data,
1434                                      family=families.Poisson(),
1435                                      offset=2 * offset)
1436        result6 = model6.fit(start_params=result5.params)
1437        assert_allclose(result5.params, result6.params)
1438        assert_equal(result6.converged, True)
1439
1440    def test_sensitivity(self):
1441
1442        va = cov_struct.Exchangeable()
1443        family = families.Gaussian()
1444
1445        np.random.seed(34234)
1446        n = 100
1447        Y = np.random.normal(size=n)
1448        X1 = np.random.normal(size=n)
1449        X2 = np.random.normal(size=n)
1450        groups = np.kron(np.arange(50), np.r_[1, 1])
1451
1452        D = pd.DataFrame({"Y": Y, "X1": X1, "X2": X2})
1453
1454        mod = gee.GEE.from_formula("Y ~ X1 + X2", groups, D,
1455                                   family=family, cov_struct=va)
1456        rslt = mod.fit()
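        # Refit with the exchangeable dependence parameter held fixed
        # at each of two values, 0 and 0.5.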
1457        ps = rslt.params_sensitivity(0, 0.5, 2)
        assert_equal(len(ps), 2)
1459        assert_almost_equal([x.cov_struct.dep_params for x in ps],
1460                            [0.0, 0.5])
1461
1462        # Regression test
1463        assert_almost_equal([x.params[0] for x in ps],
1464                            [0.1696214707458818, 0.17836097387799127])
1465
1466    def test_equivalence(self):
1467        """
1468        The Equivalence covariance structure can represent an
1469        exchangeable covariance structure.  Here we check that the
1470        results are identical using the two approaches.
1471        """
1472
1473        np.random.seed(3424)
1474        endog = np.random.normal(size=20)
1475        exog = np.random.normal(size=(20, 2))
1476        exog[:, 0] = 1
1477        groups = np.kron(np.arange(5), np.ones(4))
1478        groups[12:] = 3  # Create unequal size groups
1479
1480        # Set up an Equivalence covariance structure to mimic an
1481        # Exchangeable covariance structure.
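        # `pairs` maps each group to labeled sets of index pairs; all
        # pairs sharing a label are constrained to have a common
        # covariance parameter.  Label 0 collects the diagonal
        # (variance) positions and label 1 the off-diagonal positions,
        # which reproduces an exchangeable structure.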
1482        pairs = {}
1483        start = [0, 4, 8, 12]
1484        for k in range(4):
1485            pairs[k] = {}
1486
1487            # Diagonal values (variance parameters)
1488            if k < 3:
1489                pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3],
1490                               start[k] + np.r_[0, 1, 2, 3])
1491            else:
1492                pairs[k][0] = (start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7],
1493                               start[k] + np.r_[0, 1, 2, 3, 4, 5, 6, 7])
1494
1495            # Off-diagonal pairs (covariance parameters)
1496            if k < 3:
1497                a, b = np.tril_indices(4, -1)
1498                pairs[k][1] = (start[k] + a, start[k] + b)
1499            else:
1500                a, b = np.tril_indices(8, -1)
1501                pairs[k][1] = (start[k] + a, start[k] + b)
1502
1503        ex = cov_struct.Exchangeable()
1504        model1 = gee.GEE(endog, exog, groups, cov_struct=ex)
1505        result1 = model1.fit()
1506
1507        for return_cov in False, True:
1508
1509            ec = cov_struct.Equivalence(pairs, return_cov=return_cov)
1510            model2 = gee.GEE(endog, exog, groups, cov_struct=ec)
1511            result2 = model2.fit()
1512
            # Use a larger atol/rtol for the correlation case since
            # there are some small differences in the results due to
            # degrees-of-freedom differences.
1516            if return_cov is True:
1517                atol, rtol = 1e-6, 1e-6
1518            else:
1519                atol, rtol = 1e-3, 1e-3
1520            assert_allclose(result1.params, result2.params,
1521                            atol=atol, rtol=rtol)
1522            assert_allclose(result1.bse, result2.bse, atol=atol, rtol=rtol)
1523            assert_allclose(result1.scale, result2.scale, atol=atol, rtol=rtol)
1524
1525    def test_equivalence_from_pairs(self):
1526
1527        np.random.seed(3424)
1528        endog = np.random.normal(size=50)
1529        exog = np.random.normal(size=(50, 2))
1530        exog[:, 0] = 1
1531        groups = np.kron(np.arange(5), np.ones(10))
1532        groups[30:] = 3  # Create unequal size groups
1533
1534        # Set up labels.
1535        labels = np.kron(np.arange(5), np.ones(10)).astype(np.int32)
1536        labels = labels[np.random.permutation(len(labels))]
1537
1538        eq = cov_struct.Equivalence(labels=labels, return_cov=True)
1539        model1 = gee.GEE(endog, exog, groups, cov_struct=eq)
1540
1541        # Call this directly instead of letting init do it to get the
1542        # result before reindexing.
1543        eq._pairs_from_labels()
1544
        # Each group should be covered completely: m*(m+1)/2 index
        # pairs, counting the diagonal, for a group of size m.
1546        for g in model1.group_labels:
1547            p = eq.pairs[g]
1548            vl = [len(x[0]) for x in p.values()]
1549            m = sum(groups == g)
1550            assert_allclose(sum(vl), m * (m + 1) / 2)
1551
1552        # Check for duplicates.
        ixs = set()
1554        for g in model1.group_labels:
1555            for v in eq.pairs[g].values():
1556                for a, b in zip(v[0], v[1]):
1557                    ky = (a, b)
                    assert ky not in ixs
1559                    ixs.add(ky)
1560
1561        # Smoke test  # TODO: pytest.mark.smoke?
1562        eq = cov_struct.Equivalence(labels=labels, return_cov=True)
1563        model1 = gee.GEE(endog, exog, groups, cov_struct=eq)
1564        with warnings.catch_warnings():
1565            warnings.simplefilter('ignore')
1566            model1.fit(maxiter=2)
1567
1568
1569class CheckConsistency(object):
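    # Mixin: subclasses provide `mod` (and usually `start_params`) in
    # setup_class; test_cov_type then checks that the robust, naive and
    # bias_reduced covariance types are computed and stored consistently.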
1570
1571    start_params = None
1572
1573    def test_cov_type(self):
1574        mod = self.mod
1575        res_robust = mod.fit(start_params=self.start_params)
1576        res_naive = mod.fit(start_params=self.start_params,
1577                            cov_type='naive')
1578        res_robust_bc = mod.fit(start_params=self.start_params,
1579                                cov_type='bias_reduced')
1580
1581        # call summary to make sure it does not change cov_type
1582        res_naive.summary()
1583        res_robust_bc.summary()
1584
1585        # check cov_type
1586        assert_equal(res_robust.cov_type, 'robust')
1587        assert_equal(res_naive.cov_type, 'naive')
1588        assert_equal(res_robust_bc.cov_type, 'bias_reduced')
1589
1590        # check bse and cov_params
1591        # we are comparing different runs of the optimization
1592        # bse in ordinal and multinomial have an atol around 5e-10 for two
1593        # consecutive calls to fit.
1594        rtol = 1e-8
1595        for (res, cov_type, cov) in [
1596                (res_robust, 'robust', res_robust.cov_robust),
1597                (res_naive, 'naive', res_robust.cov_naive),
1598                (res_robust_bc, 'bias_reduced', res_robust_bc.cov_robust_bc)
1599        ]:
1600            bse = np.sqrt(np.diag(cov))
1601            assert_allclose(res.bse, bse, rtol=rtol)
1602            if cov_type != 'bias_reduced':
1603                # cov_type=naive shortcuts calculation of bias reduced
1604                # covariance for efficiency
1605                bse = res_naive.standard_errors(cov_type=cov_type)
1606                assert_allclose(res.bse, bse, rtol=rtol)
1607            assert_allclose(res.cov_params(), cov, rtol=rtol, atol=1e-10)
1608            assert_allclose(res.cov_params_default, cov, rtol=rtol, atol=1e-10)
1609
1610        # assert that we do not have a copy
1611        assert_(res_robust.cov_params_default is res_robust.cov_robust)
1612        assert_(res_naive.cov_params_default is res_naive.cov_naive)
1613        assert_(res_robust_bc.cov_params_default is
1614                res_robust_bc.cov_robust_bc)
1615
1616        # check exception for misspelled cov_type
1617        assert_raises(ValueError, mod.fit, cov_type='robust_bc')
1618
1619
1620class TestGEEPoissonCovType(CheckConsistency):
1621
1622    @classmethod
1623    def setup_class(cls):
1624
1625        endog, exog, group_n = load_data("gee_poisson_1.csv")
1626
1627        family = families.Poisson()
1628        vi = cov_struct.Independence()
1629
1630        cls.mod = gee.GEE(endog, exog, group_n, None, family, vi)
1631
1632        cls.start_params = np.array([-0.03644504, -0.05432094,  0.01566427,
1633                                     0.57628591, -0.0046566,  -0.47709315])
1634
1635    def test_wrapper(self):
1636
1637        endog, exog, group_n = load_data("gee_poisson_1.csv",
1638                                         icept=False)
1639        endog = pd.Series(endog)
1640        exog = pd.DataFrame(exog)
1641        group_n = pd.Series(group_n)
1642
1643        family = families.Poisson()
1644        vi = cov_struct.Independence()
1645
1646        mod = gee.GEE(endog, exog, group_n, None, family, vi)
1647        rslt2 = mod.fit()
1648
1649        check_wrapper(rslt2)
1650
1651
1652class TestGEEPoissonFormulaCovType(CheckConsistency):
1653
1654    @classmethod
1655    def setup_class(cls):
1656
1657        endog, exog, group_n = load_data("gee_poisson_1.csv")
1658
1659        family = families.Poisson()
1660        vi = cov_struct.Independence()
1661        # Test with formulas
1662        D = np.concatenate((endog[:, None], group_n[:, None],
1663                            exog[:, 1:]), axis=1)
1664        D = pd.DataFrame(D)
1665        D.columns = ["Y", "Id", ] + ["X%d" % (k + 1)
1666                                     for k in range(exog.shape[1] - 1)]
1667
1668        cls.mod = gee.GEE.from_formula("Y ~ X1 + X2 + X3 + X4 + X5", "Id",
1669                                       D, family=family, cov_struct=vi)
1670
1671        cls.start_params = np.array([-0.03644504, -0.05432094,  0.01566427,
1672                                     0.57628591, -0.0046566,  -0.47709315])
1673
1674
1675class TestGEEOrdinalCovType(CheckConsistency):
1676
1677    @classmethod
1678    def setup_class(cls):
1679
1680        family = families.Binomial()
1681
1682        endog, exog, groups = load_data("gee_ordinal_1.csv",
1683                                        icept=False)
1684
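        # The global odds ratio structure models within-cluster
        # dependence of the ordinal outcome through a common odds ratio.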
1685        va = cov_struct.GlobalOddsRatio("ordinal")
1686
1687        cls.mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
1688        cls.start_params = np.array([1.09250002, 0.0217443, -0.39851092,
1689                                     -0.01812116, 0.03023969, 1.18258516,
1690                                     0.01803453, -1.10203381])
1691
1692    def test_wrapper(self):
1693
1694        endog, exog, groups = load_data("gee_ordinal_1.csv",
1695                                        icept=False)
1696
1697        endog = pd.Series(endog, name='yendog')
1698        exog = pd.DataFrame(exog)
1699        groups = pd.Series(groups, name='the_group')
1700
1701        family = families.Binomial()
1702        va = cov_struct.GlobalOddsRatio("ordinal")
1703        mod = gee.OrdinalGEE(endog, exog, groups, None, family, va)
1704        rslt2 = mod.fit()
1705
1706        check_wrapper(rslt2)
1707
1708
1709class TestGEEMultinomialCovType(CheckConsistency):
1710
1711    @classmethod
1712    def setup_class(cls):
1713
1714        endog, exog, groups = load_data("gee_nominal_1.csv",
1715                                        icept=False)
1716
1717        # Test with independence correlation
1718        va = cov_struct.Independence()
1719        cls.mod = gee.NominalGEE(endog, exog, groups, cov_struct=va)
1720        cls.start_params = np.array([0.44944752,  0.45569985, -0.92007064,
1721                                     -0.46766728])
1722
1723    def test_wrapper(self):
1724
1725        endog, exog, groups = load_data("gee_nominal_1.csv",
1726                                        icept=False)
1727        endog = pd.Series(endog, name='yendog')
1728        exog = pd.DataFrame(exog)
1729        groups = pd.Series(groups, name='the_group')
1730
1731        va = cov_struct.Independence()
1732        mod = gee.NominalGEE(endog, exog, groups, cov_struct=va)
1733        rslt2 = mod.fit()
1734
1735        check_wrapper(rslt2)
1736
1737
1738def test_regularized_poisson():
1739
1740    np.random.seed(8735)
1741
1742    ng, gs, p = 1000, 5, 5
1743
1744    x = np.random.normal(size=(ng*gs, p))
1745    r = 0.5
1746    x[:, 2] = r*x[:, 1] + np.sqrt(1-r**2)*x[:, 2]
1747    lpr = 0.7*(x[:, 1] - x[:, 3])
1748    mean = np.exp(lpr)
1749    y = np.random.poisson(mean)
1750
1751    groups = np.kron(np.arange(ng), np.ones(gs))
1752
1753    model = gee.GEE(y, x, groups=groups, family=families.Poisson())
1754    result = model.fit_regularized(0.0000001)
1755
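    # The true coefficients are 0.7 on x[:, 1], -0.7 on x[:, 3] and
    # zero elsewhere; with a tiny penalty the regularized fit should be
    # close to these values.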
1756    assert_allclose(result.params, 0.7 * np.r_[0, 1, 0, -1, 0],
1757                    rtol=0.01, atol=0.12)
1758
1759
1760def test_regularized_gaussian():
1761
1762    # Example 1 from Wang et al.
1763
1764    np.random.seed(8735)
1765
1766    ng, gs, p = 200, 4, 200
1767
1768    groups = np.kron(np.arange(ng), np.ones(gs))
1769
1770    x = np.zeros((ng*gs, p))
1771    x[:, 0] = 1 * (np.random.uniform(size=ng*gs) < 0.5)
1772    x[:, 1] = np.random.normal(size=ng*gs)
1773    r = 0.5
1774    for j in range(2, p):
1775        eps = np.random.normal(size=ng*gs)
1776        x[:, j] = r * x[:, j-1] + np.sqrt(1 - r**2) * eps
1777    lpr = np.dot(x[:, 0:4], np.r_[2, 3, 1.5, 2])
1778    s = 0.4
1779    e = np.sqrt(s) * np.kron(np.random.normal(size=ng), np.ones(gs))
1780    e += np.sqrt(1 - s) * np.random.normal(size=ng*gs)
1781
1782    y = lpr + e
1783
1784    model = gee.GEE(y, x, cov_struct=cov_struct.Exchangeable(), groups=groups)
1785    result = model.fit_regularized(0.01, maxiter=100)
1786
1787    ex = np.zeros(200)
1788    ex[0:4] = np.r_[2, 3, 1.5, 2]
1789    assert_allclose(result.params, ex, rtol=0.01, atol=0.2)
1790
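    # The within-group error correlation is exchangeable with parameter
    # s, which the estimated dependence parameter should recover.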
1791    assert_allclose(model.cov_struct.dep_params, np.r_[s],
1792                    rtol=0.01, atol=0.05)
1793
1794
1795@pytest.mark.smoke
1796@pytest.mark.matplotlib
1797def test_plots(close_figures):
1798
1799    np.random.seed(378)
    endog = np.random.normal(size=100)
    exog = np.random.normal(size=(100, 2))
    groups = np.kron(np.arange(50), np.r_[1, 1])

    model = gee.GEE(endog, exog, groups)
1805    result = model.fit()
1806    fig = result.plot_added_variable(1)
1807    assert_equal(isinstance(fig, plt.Figure), True)
1808    fig = result.plot_partial_residuals(1)
1809    assert_equal(isinstance(fig, plt.Figure), True)
1810    fig = result.plot_ceres_residuals(1)
1811    assert_equal(isinstance(fig, plt.Figure), True)
1812    fig = result.plot_isotropic_dependence()
1813    assert_equal(isinstance(fig, plt.Figure), True)
1814
1815
1816def test_missing():
1817    # gh-1877
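    # Rows with a missing value of `fake` should be dropped by the
    # formula interface, so the fit should match a model built from the
    # data after an explicit dropna().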
1818    data = [['id', 'al', 'status', 'fake', 'grps'],
1819            ['4A', 'A', 1, 1, 0],
1820            ['5A', 'A', 1, 2.0, 1],
1821            ['6A', 'A', 1, 3, 2],
1822            ['7A', 'A', 1, 2.0, 3],
1823            ['8A', 'A', 1, 1, 4],
1824            ['9A', 'A', 1, 2.0, 5],
1825            ['11A', 'A', 1, 1, 6],
1826            ['12A', 'A', 1, 2.0, 7],
1827            ['13A', 'A', 1, 1, 8],
1828            ['14A', 'A', 1, 1, 9],
1829            ['15A', 'A', 1, 1, 10],
1830            ['16A', 'A', 1, 2.0, 11],
1831            ['17A', 'A', 1, 3.0, 12],
1832            ['18A', 'A', 1, 3.0, 13],
1833            ['19A', 'A', 1, 2.0, 14],
1834            ['20A', 'A', 1, 2.0, 15],
1835            ['2C', 'C', 0, 3.0, 0],
1836            ['3C', 'C', 0, 1, 1],
1837            ['4C', 'C', 0, 1, 2],
1838            ['5C', 'C', 0, 2.0, 3],
1839            ['6C', 'C', 0, 1, 4],
1840            ['9C', 'C', 0, 1, 5],
1841            ['10C', 'C', 0, 3, 6],
1842            ['12C', 'C', 0, 3, 7],
1843            ['14C', 'C', 0, 2.5, 8],
1844            ['15C', 'C', 0, 1, 9],
1845            ['17C', 'C', 0, 1, 10],
1846            ['22C', 'C', 0, 1, 11],
1847            ['23C', 'C', 0, 1, 12],
1848            ['24C', 'C', 0, 1, 13],
1849            ['32C', 'C', 0, 2.0, 14],
1850            ['35C', 'C', 0, 1, 15]]
1851
1852    df = pd.DataFrame(data[1:], columns=data[0])
1853    df.loc[df.fake == 1, 'fake'] = np.nan
1854    mod = gee.GEE.from_formula('status ~ fake', data=df, groups='grps',
1855                               cov_struct=cov_struct.Independence(),
1856                               family=families.Binomial())
1857
1858    df = df.dropna().copy()
1859    df['constant'] = 1
1860
1861    mod2 = gee.GEE(df.status, df[['constant', 'fake']], groups=df.grps,
1862                   cov_struct=cov_struct.Independence(),
1863                   family=families.Binomial())
1864
1865    assert_equal(mod.endog, mod2.endog)
1866    assert_equal(mod.exog, mod2.exog)
1867    assert_equal(mod.groups, mod2.groups)
1868
1869    res = mod.fit()
1870    res2 = mod2.fit()
1871
1872    assert_almost_equal(res.params.values, res2.params.values)
1873
1874
1875def simple_qic_data(fam):
1876
1877    y = np.r_[0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0]
1878    x1 = np.r_[0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0]
1879    x2 = np.r_[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
1880    g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4]
1881    x1 = x1[:, None]
1882    x2 = x2[:, None]
1883
1884    return y, x1, x2, g
1885
1886
# Check the numerically integrated quasi-likelihood in two settings
# where a closed-form expression is available.
1889@pytest.mark.parametrize("family", [families.Gaussian, families.Poisson])
1890def test_ql_known(family):
1891
1892    fam = family()
1893
1894    y, x1, x2, g = simple_qic_data(family)
1895
1896    model1 = gee.GEE(y, x1, family=fam, groups=g)
1897    result1 = model1.fit(ddof_scale=0)
1898    mean1 = result1.fittedvalues
1899
1900    model2 = gee.GEE(y, x2, family=fam, groups=g)
1901    result2 = model2.fit(ddof_scale=0)
1902    mean2 = result2.fittedvalues
1903
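    # Closed forms: with the scale estimated as the mean squared
    # residual (ddof_scale=0), the Gaussian QL is -n/2; the Poisson QL
    # is the kernel sum(y*log(mu) - mu) minus its value at the
    # saturated fit (mu = y).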
1904    if family is families.Gaussian:
1905        ql1 = -len(y) / 2.
1906        ql2 = -len(y) / 2.
1907    elif family is families.Poisson:
1908        c = np.zeros_like(y)
1909        ii = y > 0
1910        c[ii] = y[ii] * np.log(y[ii]) - y[ii]
1911        ql1 = np.sum(y * np.log(mean1) - mean1 - c)
1912        ql2 = np.sum(y * np.log(mean2) - mean2 - c)
1913    else:
1914        raise ValueError("Unknown family")
1915
1916    qle1 = model1.qic(result1.params, result1.scale, result1.cov_params())
1917    qle2 = model2.qic(result2.params, result2.scale, result2.cov_params())
1918
1919    assert_allclose(ql1, qle1[0], rtol=1e-4)
1920    assert_allclose(ql2, qle2[0], rtol=1e-4)
1921
1922    with warnings.catch_warnings():
1923        warnings.simplefilter("ignore")
1924        qler1 = result1.qic()
1925        qler2 = result2.qic()
1926    assert_allclose(qler1, qle1[1:], rtol=1e-5)
1927    assert_allclose(qler2, qle2[1:], rtol=1e-5)
1928
1929
# Compare differences of QL values computed by numerical integration.
# Differences are used so that constants which are inconvenient to
# compute cancel out.
1933@pytest.mark.parametrize("family", [families.Gaussian,
1934                                    families.Binomial,
1935                                    families.Poisson])
1936def test_ql_diff(family):
1937
1938    fam = family()
1939
1940    y, x1, x2, g = simple_qic_data(family)
1941
1942    model1 = gee.GEE(y, x1, family=fam, groups=g)
1943    result1 = model1.fit(ddof_scale=0)
1944    mean1 = result1.fittedvalues
1945
1946    model2 = gee.GEE(y, x2, family=fam, groups=g)
1947    result2 = model2.fit(ddof_scale=0)
1948    mean2 = result2.fittedvalues
1949
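    # QL kernels up to additive constants: the Gaussian values are -n/2
    # for both fits so the difference is zero; the Binomial kernel is
    # y*log(mu/(1-mu)) + log(1-mu); the Poisson kernel is
    # y*log(mu) - mu.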
1950    if family is families.Gaussian:
1951        qldiff = 0
1952    elif family is families.Binomial:
1953        qldiff = np.sum(y * np.log(mean1 / (1 - mean1)) + np.log(1 - mean1))
1954        qldiff -= np.sum(y * np.log(mean2 / (1 - mean2)) + np.log(1 - mean2))
1955    elif family is families.Poisson:
1956        qldiff = (np.sum(y * np.log(mean1) - mean1)
1957                  - np.sum(y * np.log(mean2) - mean2))
1958    else:
1959        raise ValueError("unknown family")
1960
1961    qle1, _, _ = model1.qic(result1.params, result1.scale,
1962                            result1.cov_params())
1963    qle2, _, _ = model2.qic(result2.params, result2.scale,
1964                            result2.cov_params())
1965
1966    assert_allclose(qle1 - qle2, qldiff, rtol=1e-5, atol=1e-5)
1967
1968
1969def test_qic_warnings():
1970    with pytest.warns(UserWarning):
1971        fam = families.Gaussian()
1972        y, x1, _, g = simple_qic_data(fam)
1973        model = gee.GEE(y, x1, family=fam, groups=g)
1974        result = model.fit()
1975        result.qic()
1976
1977
1978@pytest.mark.parametrize("reg", [False, True])
1979def test_quasipoisson(reg):
1980
1981    np.random.seed(343)
1982
1983    n = 1000
1984    x = np.random.normal(size=(n, 3))
1985    g = np.random.gamma(1, 1, size=n)
1986    y = np.random.poisson(g)
1987    grp = np.kron(np.arange(100), np.ones(n // 100))
1988
    model1 = gee.GEE(y, x, family=families.Poisson(), groups=grp)
    model2 = gee.GEE(y, x, family=families.Poisson(), groups=grp)
1993
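    # scale="X2" estimates the dispersion with the Pearson chi-square
    # statistic instead of fixing it at 1 for the Poisson family,
    # giving a quasi-Poisson fit.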
1994    if reg:
1995        result1 = model1.fit_regularized(pen_wt=0.1)
1996        result2 = model2.fit_regularized(pen_wt=0.1, scale="X2")
1997    else:
1998        result1 = model1.fit(cov_type="naive")
1999        result2 = model2.fit(scale="X2", cov_type="naive")
2000
2001    # The parameter estimates are the same regardless of how
2002    # the scale parameter is handled
2003    assert_allclose(result1.params, result2.params)
2004
2005    if not reg:
2006        # The robust covariance does not depend on the scale parameter,
2007        # but the naive covariance does.
2008        assert_allclose(result2.cov_naive / result1.cov_naive,
2009                        result2.scale * np.ones_like(result2.cov_naive))
2010
2011
2012def test_grid_ar():
2013
2014    np.random.seed(243)
2015
2016    r = 0.5
2017    m = 10
2018    ng = 100
2019    ii = np.arange(m)
2020    cov = r**np.abs(np.subtract.outer(ii, ii))
2021    covr = np.linalg.cholesky(cov)
2022
2023    e = [np.dot(covr, np.random.normal(size=m)) for k in range(ng)]
2024    e = 2 * np.concatenate(e)
2025
2026    grps = [[k]*m for k in range(ng)]
2027    grps = np.concatenate(grps)
2028
2029    x = np.random.normal(size=(ng*m, 3))
2030    y = np.dot(x, np.r_[1, -1, 0]) + e
2031
2032    model1 = gee.GEE(y, x, groups=grps,
2033                     cov_struct=cov_struct.Autoregressive(grid=False))
2034    result1 = model1.fit()
2035
2036    model2 = gee.GEE(y, x, groups=grps,
2037                     cov_struct=cov_struct.Autoregressive(grid=True))
2038    result2 = model2.fit()
2039
2040    model3 = gee.GEE(y, x, groups=grps,
2041                     cov_struct=cov_struct.Stationary(max_lag=1, grid=False))
2042    result3 = model3.fit()
2043
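    # The gridded and non-gridded AR(1) implementations should agree on
    # the dependence parameter, and it should match the lag-1 parameter
    # of the stationary structure.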
2044    assert_allclose(result1.cov_struct.dep_params,
2045                    result2.cov_struct.dep_params,
2046                    rtol=0.05)
2047    assert_allclose(result1.cov_struct.dep_params,
2048                    result3.cov_struct.dep_params[1], rtol=0.05)
2049
2050
2051def test_unstructured_complete():
2052
2053    np.random.seed(43)
2054    ngrp = 400
2055    cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
2056    covr = np.linalg.cholesky(cov)
2057    e = np.random.normal(size=(ngrp, 3))
2058    e = np.dot(e, covr.T)
2059    xmat = np.random.normal(size=(3*ngrp, 3))
2060    par = np.r_[1, -2, 0.1]
2061    ey = np.dot(xmat, par)
2062    y = ey + e.ravel()
2063    g = np.kron(np.arange(ngrp), np.ones(3))
2064    t = np.kron(np.ones(ngrp), np.r_[0, 1, 2]).astype(int)
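    # `time` gives each observation's position within its group, which
    # the unstructured covariance uses to align observations.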
2065
2066    m = gee.GEE(y, xmat, time=t, cov_struct=cov_struct.Unstructured(),
2067                groups=g)
2068    r = m.fit()
2069
    assert_allclose(r.params, par, rtol=0.05, atol=0.5)
    assert_allclose(m.cov_struct.dep_params, cov, rtol=0.05, atol=0.5)
2072
2073
2074def test_unstructured_incomplete():
2075
2076    np.random.seed(43)
2077    ngrp = 400
2078    cov = np.asarray([[1, 0.7, 0.2], [0.7, 1, 0.5], [0.2, 0.5, 1]])
2079    covr = np.linalg.cholesky(cov)
2080    e = np.random.normal(size=(ngrp, 3))
2081    e = np.dot(e, covr.T)
2082    xmat = np.random.normal(size=(3*ngrp, 3))
2083    par = np.r_[1, -2, 0.1]
2084    ey = np.dot(xmat, par)
2085
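    # Drop one observation per group, rotating the dropped position so
    # that every pair of time points is still observed in some group.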
2086    yl, xl, tl, gl = [], [], [], []
2087    for i in range(ngrp):
2088
2089        # Omit one observation from each group of 3
2090        ix = [0, 1, 2]
2091        ix.pop(i % 3)
2092        ix = np.asarray(ix)
2093        tl.append(ix)
2094
2095        yl.append(ey[3*i + ix] + e[i, ix])
2096        x = xmat[3*i + ix, :]
2097        xl.append(x)
2098        gl.append(i * np.ones(2))
2099
2100    y = np.concatenate(yl)
2101    x = np.concatenate(xl, axis=0)
2102    t = np.concatenate(tl)
2103    t = np.asarray(t, dtype=int)
2104    g = np.concatenate(gl)
2105
2106    m = gee.GEE(y, x, time=t[:, None], cov_struct=cov_struct.Unstructured(),
2107                groups=g)
2108    r = m.fit()
2109
    assert_allclose(r.params, par, rtol=0.05, atol=0.5)
    assert_allclose(m.cov_struct.dep_params, cov, rtol=0.05, atol=0.5)
2112
2113
2114def test_ar_covsolve():
2115
2116    np.random.seed(123)
2117
2118    c = cov_struct.Autoregressive(grid=True)
2119    c.dep_params = 0.4
2120
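    # covariance_matrix_solve should return V^-1 * rhs, where
    # V = diag(sd) * R * diag(sd); compare against the direct solve
    # built below.  The exchangeable and stationary structures are
    # checked the same way in the following tests.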
2121    for d in 1, 2, 4:
2122        for q in 1, 4:
2123
2124            ii = np.arange(d)
2125            mat = 0.4 ** np.abs(np.subtract.outer(ii, ii))
2126            sd = np.random.uniform(size=d)
2127
2128            if q == 1:
2129                z = np.random.normal(size=d)
2130            else:
2131                z = np.random.normal(size=(d, q))
2132
2133            sm = np.diag(sd)
2134            z1 = np.linalg.solve(sm,
2135                                 np.linalg.solve(mat, np.linalg.solve(sm, z)))
2136            z2 = c.covariance_matrix_solve(np.zeros_like(sd),
2137                                           np.zeros_like(sd),
2138                                           sd, [z])
2139
2140            assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5)
2141
2142
2143def test_ex_covsolve():
2144
2145    np.random.seed(123)
2146
2147    c = cov_struct.Exchangeable()
2148    c.dep_params = 0.4
2149
2150    for d in 1, 2, 4:
2151        for q in 1, 4:
2152
2153            mat = 0.4 * np.ones((d, d)) + 0.6 * np.eye(d)
2154            sd = np.random.uniform(size=d)
2155
2156            if q == 1:
2157                z = np.random.normal(size=d)
2158            else:
2159                z = np.random.normal(size=(d, q))
2160
2161            sm = np.diag(sd)
2162            z1 = np.linalg.solve(sm,
2163                                 np.linalg.solve(mat, np.linalg.solve(sm, z)))
2164            z2 = c.covariance_matrix_solve(np.zeros_like(sd),
2165                                           np.arange(d, dtype=int),
2166                                           sd, [z])
2167
2168            assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5)
2169
2170
2171def test_stationary_covsolve():
2172
2173    np.random.seed(123)
2174
2175    c = cov_struct.Stationary(grid=True)
2176    c.time = np.arange(10, dtype=int)
2177
2178    for d in 1, 2, 4:
2179        for q in 1, 4:
2180
2181            c.dep_params = (2.0 ** (-np.arange(d)))
2182            c.max_lag = d - 1
2183            mat, _ = c.covariance_matrix(np.zeros(d),
2184                                         np.arange(d, dtype=int))
2185            sd = np.random.uniform(size=d)
2186
2187            if q == 1:
2188                z = np.random.normal(size=d)
2189            else:
2190                z = np.random.normal(size=(d, q))
2191
2192            sm = np.diag(sd)
2193            z1 = np.linalg.solve(sm,
2194                                 np.linalg.solve(mat, np.linalg.solve(sm, z)))
2195            z2 = c.covariance_matrix_solve(np.zeros_like(sd),
2196                                           np.arange(d, dtype=int),
2197                                           sd, [z])
2198
2199            assert_allclose(z1, z2[0], rtol=1e-5, atol=1e-5)
2200