1"""
2Assesment of Generalized Estimating Equations using simulation.
3
4This script checks ordinal and nominal models for multinomial data.
5
6See the generated file "gee_categorical_simulation_check.txt" for
7results.
8"""
9from statsmodels.compat.python import lrange
10import numpy as np
11from scipy import stats
12from statsmodels.genmod.generalized_estimating_equations import GEE,\
13    gee_setup_ordinal, gee_setup_nominal,\
14    gee_ordinal_starting_values, Multinomial
15from statsmodels.genmod.families import Binomial
16from statsmodels.genmod.cov_struct import GlobalOddsRatio
17from .gee_gaussian_simulation_check import GEE_simulator
18
19
20class ordinal_simulator(GEE_simulator):
21
22    # The thresholds where the latent continuous process is cut to
23    # obtain the categorical values.
24    thresholds = None
25
26
27    def true_params(self):
28        return np.concatenate((self.thresholds, self.params))
29
30
31    def starting_values(self, nconstraints):
32        beta = gee_ordinal_starting_values(self.endog,
33                                           len(self.params))
34        if nconstraints > 0:
35            m = self.exog_ex.shape[1] - nconstraints
36            beta = beta[0:m]
37
38        return beta
39
40
41    def print_dparams(self, dparams_est):
42        OUT.write("Odds ratio estimate:   %8.4f\n" % dparams_est[0])
43        OUT.write("Odds ratio truth:      %8.4f\n" %
44                  self.dparams[0])
45        OUT.write("\n")
46
47
48    def simulate(self):
49
50        endog, exog, group, time = [], [], [], []
51
52        for i in range(self.ngroups):
53
54            gsize = np.random.randint(self.group_size_range[0],
55                                      self.group_size_range[1])
56
57            group.append([i,] * gsize)
58
59            time1 = np.random.normal(size=(gsize,2))
60            time.append(time1)
61
62            exog1 = np.random.normal(size=(gsize, len(self.params)))
63            exog.append(exog1)
64
65            lp = np.dot(exog1, self.params)
66
67            z = np.random.uniform(size=gsize)
68            z = np.log(z / (1 - z)) + lp
69            endog1 = np.array([np.sum(x > self.thresholds) for x in z])
70            endog.append(endog1)
71
72        self.exog = np.concatenate(exog, axis=0)
73        self.endog = np.concatenate(endog)
74        self.time = np.concatenate(time, axis=0)
75        self.group = np.concatenate(group)
76        self.offset = np.zeros(len(self.endog), dtype=np.float64)
77
78
79class nominal_simulator(GEE_simulator):
80
81    def starting_values(self, nconstraints):
82        return None
83
84    def true_params(self):
85        return np.concatenate(self.params[:-1])
86
87    def print_dparams(self, dparams_est):
88        OUT.write("Odds ratio estimate:   %8.4f\n" % dparams_est[0])
89        OUT.write("Odds ratio truth:      %8.4f\n" % self.dparams[0])
90        OUT.write("\n")
91
92    def simulate(self):
93
94        endog, exog, group, time = [], [], [], []
95
96        for i in range(self.ngroups):
97
98            gsize = np.random.randint(self.group_size_range[0],
99                                      self.group_size_range[1])
100
101            group.append([i,] * gsize)
102
103            time1 = np.random.normal(size=(gsize,2))
104            time.append(time1)
105
106            exog1 = np.random.normal(size=(gsize, len(self.params[0])))
107            exog.append(exog1)
108
109            # Probabilities for each outcome
110            prob = [np.exp(np.dot(exog1, p)) for p in self.params]
111            prob = np.vstack(prob).T
112            prob /= prob.sum(1)[:, None]
113
114            m = len(self.params)
115            endog1 = []
116            for k in range(gsize):
117                pdist = stats.rv_discrete(values=(lrange(m),
118                                                  prob[k,:]))
119                endog1.append(pdist.rvs())
120
121            endog.append(np.asarray(endog1))
122
123        self.exog = np.concatenate(exog, axis=0)
124        self.endog = np.concatenate(endog).astype(np.int32)
125        self.time = np.concatenate(time, axis=0)
126        self.group = np.concatenate(group)
127        self.offset = np.zeros(len(self.endog), dtype=np.float64)
128
129
130
131def gendat_ordinal():
132
133    os = ordinal_simulator()
134    os.params = np.r_[0., 1]
135    os.ngroups = 200
136    os.thresholds = [1, 0, -1]
137    os.dparams = [1.,]
138    os.simulate()
139
140    data = np.concatenate((os.endog[:,None], os.exog,
141                           os.group[:,None]), axis=1)
142
143    os.endog_ex, os.exog_ex, os.intercepts, os.nthresh = \
144        gee_setup_ordinal(data, 0)
145
146    os.group_ex = os.exog_ex[:,-1]
147    os.exog_ex = os.exog_ex[:,0:-1]
148
149    os.exog_ex = np.concatenate((os.intercepts, os.exog_ex),
150                                axis=1)
151
152    va = GlobalOddsRatio(4, "ordinal")
153
154    lhs = np.array([[0., 0., 0, 1., 0.], [0., 0, 0, 0, 1]])
155    rhs = np.r_[0., 1]
156
157    return os, va, Binomial(), (lhs, rhs)
158
159
160def gendat_nominal():
161
162    ns = nominal_simulator()
163
164    # The last component of params must be identically zero
165    ns.params = [np.r_[0., 1], np.r_[-1., 0], np.r_[0., 0]]
166    ns.ngroups = 200
167    ns.dparams = [1., ]
168    ns.simulate()
169
170    data = np.concatenate((ns.endog[:,None], ns.exog,
171                           ns.group[:,None]), axis=1)
172
173    ns.endog_ex, ns.exog_ex, ns.exog_ne, ns.nlevel = \
174        gee_setup_nominal(data, 0, [3,])
175
176    ns.group_ex = ns.exog_ne[:,0]
177
178    va = GlobalOddsRatio(3, "nominal")
179
180    lhs = np.array([[0., 1., 1, 0],])
181    rhs = np.r_[0.,]
182
183    return ns, va, Multinomial(3), (lhs, rhs)
184
185
186if __name__ == '__main__':
187
188    nrep = 100
189
190    OUT = open("gee_categorical_simulation_check.txt", "w")
191
192    np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x},
193                        suppress=True)
194
195    # Loop over data generating models
196    gendats = [gendat_nominal, gendat_ordinal]
197
198    for jg,gendat in enumerate(gendats):
199
200        dparams = []
201        params = []
202        std_errors = []
203        pvalues = []
204
205        for j in range(nrep):
206
207            da, va, mt, constraint = gendat()
208
209            beta = da.starting_values(0)
210
211            md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None,
212                     mt, va)
213            mdf = md.fit(start_params = beta)
214
215            if mdf is None:
216                continue
217
218            scale_inv = 1 / md.estimate_scale()
219
220            dparams.append(np.r_[va.dparams, scale_inv])
221
222            params.append(np.asarray(mdf.params))
223            std_errors.append(np.asarray(mdf.standard_errors))
224
225            da, va, mt, constraint = gendat()
226
227            beta = da.starting_values(constraint[0].shape[0])
228
229            md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None,
230                     mt, va, constraint=constraint)
231            mdf = md.fit(start_params = beta)
232
233            if mdf is None:
234                continue
235
236            score = md.score_test_results
237            pvalues.append(score["p-value"])
238
239        dparams_mean = np.array(sum(dparams) / len(dparams))
240
241        OUT.write("%s data.\n" % ("Nominal", "Ordinal")[jg])
242        OUT.write("%d runs converged successfully.\n" % len(pvalues))
243
244        OUT.write("Checking dependence parameters:\n")
245        da.print_dparams(dparams_mean)
246
247        params = np.array(params)
248        eparams = params.mean(0)
249        sdparams = params.std(0)
250        std_errors = np.array(std_errors)
251        std_errors = std_errors.mean(0)
252        true_params = da.true_params()
253
254        OUT.write("Checking parameter values:\n")
255        OUT.write("Observed:            ")
256        OUT.write(np.array_str(eparams) + "\n")
257        OUT.write("Expected:            ")
258        OUT.write(np.array_str(true_params) + "\n")
259        OUT.write("Absolute difference: ")
260        OUT.write(np.array_str(eparams - true_params) + "\n")
261        OUT.write("Relative difference: ")
262        OUT.write(np.array_str((eparams - true_params) / true_params)
263                  + "\n")
264        OUT.write("\n")
265
266        OUT.write("Checking standard errors:\n")
267        OUT.write("Observed:            ")
268        OUT.write(np.array_str(sdparams) + "\n")
269        OUT.write("Expected:            ")
270        OUT.write(np.array_str(std_errors) + "\n")
271        OUT.write("Absolute difference: ")
272        OUT.write(np.array_str(sdparams - std_errors) + "\n")
273        OUT.write("Relative difference: ")
274        OUT.write(np.array_str((sdparams - std_errors) / std_errors)
275                  + "\n")
276        OUT.write("\n")
277
278        OUT.write("Checking constrained estimation:\n")
279        OUT.write("Observed   Expected\n")
280
281        pvalues.sort()
282        for q in np.arange(0.1, 0.91, 0.1):
283            OUT.write("%10.3f %10.3f\n" %
284                      (pvalues[int(q*len(pvalues))], q))
285
286        OUT.write("=" * 80 + "\n\n")
287
288    OUT.close()
289