1"""
2Assesment of Generalized Estimating Equations using simulation.
3
4Only Gaussian models are currently checked.
5
6See the generated file "gee_simulation_check.txt" for results.
7"""
8from statsmodels.compat.python import lrange
9import scipy
10import numpy as np
11from itertools import product
12from statsmodels.genmod.families import Gaussian
13from statsmodels.genmod.generalized_estimating_equations import GEE
14from statsmodels.genmod.cov_struct import Autoregressive, Nested
15
16np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x},
17                    suppress=True)
18
19
20OUT = open("gee_simulation_check.txt", "w")
21
22class GEE_simulator(object):
23
24    #
25    # Parameters that must be defined
26    #
27
28    # Number of groups
29    ngroups = None
30
31    # Standard deviation of the pure errors
32    error_sd = None
33
34    # The regression coefficients
35    params = None
36
37    # The parameters defining the dependence structure
38    dparams = None
39
40
41    #
42    # Output parameters
43    #
44
45    # Matrix of exogeneous data (rows are cases, columns are
46    # variables)
47    exog = None
48
49    # Matrix of endogeneous data (len(endog) = exog.shape[0])
50    endog = None
51
52    # Matrix of time information (time.shape[0] = len(endog))
53    time = None
54
55    # Group labels (len(groups) = len(endog))
56    group = None
57
58    # Group sizes are random within this range
59    group_size_range = [4, 11]
60
61    # dparams_est is dparams with scale_inv appended
62    def print_dparams(self, dparams_est):
63        raise NotImplementedError
64
65
66class AR_simulator(GEE_simulator):
67
68    # The distance function for determining AR correlations.
69    distfun = [lambda x, y: np.sqrt(np.sum((x-y)**2)),]
70
71
72    def print_dparams(self, dparams_est):
73        OUT.write("AR coefficient estimate:   %8.4f\n" %
74                  dparams_est[0])
75        OUT.write("AR coefficient truth:      %8.4f\n" %
76                  self.dparams[0])
77        OUT.write("Error variance estimate:   %8.4f\n" %
78                  dparams_est[1])
79        OUT.write("Error variance truth:      %8.4f\n" %
80                  self.error_sd**2)
81        OUT.write("\n")
82
83    def simulate(self):
84
85        endog, exog, group, time = [], [], [], []
86
87        for i in range(self.ngroups):
88
89            gsize = np.random.randint(self.group_size_range[0],
90                                      self.group_size_range[1])
91
92            group.append([i,] * gsize)
93
94            time1 = np.random.normal(size=(gsize,2))
95            time.append(time1)
96
97            exog1 = np.random.normal(size=(gsize, 5))
98            exog1[:,0] = 1
99            exog.append(exog1)
100
101            # Pairwise distances within the cluster
102            distances = scipy.spatial.distance.cdist(time1, time1,
103                                                     self.distfun[0])
104
105            # Pairwise correlations within the cluster
106            correlations = self.dparams[0]**distances
107            correlations_sr = np.linalg.cholesky(correlations)
108
109            errors = np.dot(correlations_sr, np.random.normal(size=gsize))
110
111            endog1 = np.dot(exog1, self.params) + errors * self.error_sd
112            endog.append(endog1)
113
114        self.exog = np.concatenate(exog, axis=0)
115        self.endog = np.concatenate(endog)
116        self.time = np.concatenate(time, axis=0)
117        self.group = np.concatenate(group)
118
119
120
121class Nested_simulator(GEE_simulator):
122
123    # Vector containing list of nest sizes (used instead of
124    # group_size_range).
125    nest_sizes = None
126
127    # Matrix of nest id's (an output parameter)
128    id_matrix = None
129
130
131    def print_dparams(self, dparams_est):
132        for j in range(len(self.nest_sizes)):
133            OUT.write("Nest %d variance estimate:  %8.4f\n" % \
134                          (j+1, dparams_est[j]))
135            OUT.write("Nest %d variance truth:     %8.4f\n" % \
136                          (j+1, self.dparams[j]))
137
138        OUT.write("Error variance estimate:   %8.4f\n" % \
139            (dparams_est[-1] - sum(dparams_est[0:-1])))
140        OUT.write("Error variance truth:      %8.4f\n" %
141                  self.error_sd**2)
142        OUT.write("\n")
143
144
145    def simulate(self):
146
147        group_effect_var = self.dparams[0]
148
149        vcomp = self.dparams[1:]
150        vcomp.append(0)
151
152        endog, exog, group, id_matrix = [], [], [], []
153
154        for i in range(self.ngroups):
155
156            iterators = [lrange(n) for n in self.nest_sizes]
157
158            # The random effects
159            variances = [np.sqrt(v)*np.random.normal(size=n)
160                         for v,n in zip(vcomp, self.nest_sizes)]
161
162            gpe = np.random.normal() * np.sqrt(group_effect_var)
163
164            nest_all = []
165            for j in self.nest_sizes:
166                nest_all.append(set())
167
168            for nest in product(*iterators):
169
170                group.append(i)
171
172                # The sum of all random effects that apply to this
173                # unit
174                ref = gpe + sum([v[j] for v,j in zip(variances, nest)])
175
176                exog1 = np.random.normal(size=5)
177                exog1[0] = 1
178                exog.append(exog1)
179
180                error = ref + self.error_sd * np.random.normal()
181
182                endog1 = np.dot(exog1, self.params) + error
183                endog.append(endog1)
184
185                for j in range(len(nest)):
186                    nest_all[j].add(tuple(nest[0:j+1]))
187
188                nest1 = [len(x)-1 for x in nest_all]
189                id_matrix.append(nest1[0:-1])
190
191        self.exog = np.array(exog)
192        self.endog = np.array(endog)
193        self.group = np.array(group)
194        self.id_matrix = np.array(id_matrix)
195        self.time = np.zeros_like(self.endog)
196
197
198
199
200
201
202
203def check_constraint(da, va, ga):
204    """
205    Check the score testing of the parameter constraints.
206    """
207
208
209
210
211
212def gen_gendat_ar0(ar):
213    def gendat_ar0(msg = False):
214        ars = AR_simulator()
215        ars.ngroups = 200
216        ars.params = np.r_[0, -1, 1, 0, 0.5]
217        ars.error_sd = 2
218        ars.dparams = [ar,]
219        ars.simulate()
220        return ars, Autoregressive()
221    return gendat_ar0
222
223def gen_gendat_ar1(ar):
224    def gendat_ar1():
225        ars = AR_simulator()
226        ars.ngroups = 200
227        ars.params = np.r_[0, -0.8, 1.2, 0, 0.5]
228        ars.error_sd = 2
229        ars.dparams = [ar,]
230        ars.simulate()
231        return ars, Autoregressive()
232    return gendat_ar1
233
234def gendat_nested0():
235    ns = Nested_simulator()
236    ns.error_sd = 1.
237    ns.params = np.r_[0., 1, 1, -1, -1]
238    ns.ngroups = 50
239    ns.nest_sizes = [10, 5]
240    ns.dparams = [2., 1.]
241    ns.simulate()
242    return ns, Nested(ns.id_matrix)
243
244def gendat_nested1():
245    ns = Nested_simulator()
246    ns.error_sd = 2.
247    ns.params = np.r_[0, 1, 1.3, -0.8, -1.2]
248    ns.ngroups = 50
249    ns.nest_sizes = [10, 5]
250    ns.dparams = [1., 3.]
251    ns.simulate()
252    return ns, Nested(ns.id_matrix)
253
254
255nrep = 100
256
257gendats = [gen_gendat_ar0(ar) for ar in (0, 0.3, 0.6)]
258gendats.extend([gen_gendat_ar1(ar) for ar in (0, 0.3, 0.6)])
259gendats.extend([gendat_nested0, gendat_nested1])
260
261lhs = np.array([[0., 1, 1, 0, 0],])
262rhs = np.r_[0.,]
263
264# Loop over data generating models
265for gendat in gendats:
266
267    pvalues = []
268    params = []
269    std_errors = []
270    dparams = []
271
272    for j in range(nrep):
273
274        da,va = gendat()
275        ga = Gaussian()
276
277        md = GEE(da.endog, da.exog, da.group, da.time, ga, va)
278        mdf = md.fit()
279
280        scale_inv = 1 / md.estimate_scale()
281        dparams.append(np.r_[va.dparams, scale_inv])
282        params.append(np.asarray(mdf.params))
283        std_errors.append(np.asarray(mdf.standard_errors))
284
285        da,va = gendat()
286        ga = Gaussian()
287
288        md = GEE(da.endog, da.exog, da.group, da.time, ga, va,
289                 constraint=(lhs, rhs))
290        mdf = md.fit()
291        score = md.score_test_results
292        pvalue = score["p-value"]
293        pvalues.append(pvalue)
294
295    dparams_mean = np.array(sum(dparams) / len(dparams))
296    OUT.write("Checking dependence parameters:\n")
297    da.print_dparams(dparams_mean)
298
299    params = np.array(params)
300    eparams = params.mean(0)
301    sdparams = params.std(0)
302    std_errors = np.array(std_errors)
303    std_errors = std_errors.mean(0)
304
305    OUT.write("Checking parameter values:\n")
306    OUT.write("Observed:            ")
307    OUT.write(np.array_str(eparams) + "\n")
308    OUT.write("Expected:            ")
309    OUT.write(np.array_str(da.params) + "\n")
310    OUT.write("Absolute difference: ")
311    OUT.write(np.array_str(eparams - da.params) + "\n")
312    OUT.write("Relative difference: ")
313    OUT.write(np.array_str((eparams - da.params) / da.params) + "\n")
314    OUT.write("\n")
315
316    OUT.write("Checking standard errors\n")
317    OUT.write("Observed:            ")
318    OUT.write(np.array_str(sdparams) + "\n")
319    OUT.write("Expected:            ")
320    OUT.write(np.array_str(std_errors) + "\n")
321    OUT.write("Absolute difference: ")
322    OUT.write(np.array_str(sdparams - std_errors) + "\n")
323    OUT.write("Relative difference: ")
324    OUT.write(np.array_str((sdparams - std_errors) / std_errors) + "\n")
325    OUT.write("\n")
326
327    pvalues.sort()
328    OUT.write("Checking constrained estimation:\n")
329    OUT.write("Left hand side:\n")
330    OUT.write(np.array_str(lhs) + "\n")
331    OUT.write("Right hand side:\n")
332    OUT.write(np.array_str(rhs) + "\n")
333    OUT.write("Observed p-values   Expected Null p-values\n")
334    for q in np.arange(0.1, 0.91, 0.1):
335        OUT.write("%20.3f %20.3f\n" % (pvalues[int(q*len(pvalues))], q))
336
337    OUT.write("=" * 80 + "\n\n")
338
339OUT.close()
340