1""" 2Assesment of Generalized Estimating Equations using simulation. 3 4Only Gaussian models are currently checked. 5 6See the generated file "gee_simulation_check.txt" for results. 7""" 8from statsmodels.compat.python import lrange 9import scipy 10import numpy as np 11from itertools import product 12from statsmodels.genmod.families import Gaussian 13from statsmodels.genmod.generalized_estimating_equations import GEE 14from statsmodels.genmod.cov_struct import Autoregressive, Nested 15 16np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x}, 17 suppress=True) 18 19 20OUT = open("gee_simulation_check.txt", "w") 21 22class GEE_simulator(object): 23 24 # 25 # Parameters that must be defined 26 # 27 28 # Number of groups 29 ngroups = None 30 31 # Standard deviation of the pure errors 32 error_sd = None 33 34 # The regression coefficients 35 params = None 36 37 # The parameters defining the dependence structure 38 dparams = None 39 40 41 # 42 # Output parameters 43 # 44 45 # Matrix of exogeneous data (rows are cases, columns are 46 # variables) 47 exog = None 48 49 # Matrix of endogeneous data (len(endog) = exog.shape[0]) 50 endog = None 51 52 # Matrix of time information (time.shape[0] = len(endog)) 53 time = None 54 55 # Group labels (len(groups) = len(endog)) 56 group = None 57 58 # Group sizes are random within this range 59 group_size_range = [4, 11] 60 61 # dparams_est is dparams with scale_inv appended 62 def print_dparams(self, dparams_est): 63 raise NotImplementedError 64 65 66class AR_simulator(GEE_simulator): 67 68 # The distance function for determining AR correlations. 69 distfun = [lambda x, y: np.sqrt(np.sum((x-y)**2)),] 70 71 72 def print_dparams(self, dparams_est): 73 OUT.write("AR coefficient estimate: %8.4f\n" % 74 dparams_est[0]) 75 OUT.write("AR coefficient truth: %8.4f\n" % 76 self.dparams[0]) 77 OUT.write("Error variance estimate: %8.4f\n" % 78 dparams_est[1]) 79 OUT.write("Error variance truth: %8.4f\n" % 80 self.error_sd**2) 81 OUT.write("\n") 82 83 def simulate(self): 84 85 endog, exog, group, time = [], [], [], [] 86 87 for i in range(self.ngroups): 88 89 gsize = np.random.randint(self.group_size_range[0], 90 self.group_size_range[1]) 91 92 group.append([i,] * gsize) 93 94 time1 = np.random.normal(size=(gsize,2)) 95 time.append(time1) 96 97 exog1 = np.random.normal(size=(gsize, 5)) 98 exog1[:,0] = 1 99 exog.append(exog1) 100 101 # Pairwise distances within the cluster 102 distances = scipy.spatial.distance.cdist(time1, time1, 103 self.distfun[0]) 104 105 # Pairwise correlations within the cluster 106 correlations = self.dparams[0]**distances 107 correlations_sr = np.linalg.cholesky(correlations) 108 109 errors = np.dot(correlations_sr, np.random.normal(size=gsize)) 110 111 endog1 = np.dot(exog1, self.params) + errors * self.error_sd 112 endog.append(endog1) 113 114 self.exog = np.concatenate(exog, axis=0) 115 self.endog = np.concatenate(endog) 116 self.time = np.concatenate(time, axis=0) 117 self.group = np.concatenate(group) 118 119 120 121class Nested_simulator(GEE_simulator): 122 123 # Vector containing list of nest sizes (used instead of 124 # group_size_range). 125 nest_sizes = None 126 127 # Matrix of nest id's (an output parameter) 128 id_matrix = None 129 130 131 def print_dparams(self, dparams_est): 132 for j in range(len(self.nest_sizes)): 133 OUT.write("Nest %d variance estimate: %8.4f\n" % \ 134 (j+1, dparams_est[j])) 135 OUT.write("Nest %d variance truth: %8.4f\n" % \ 136 (j+1, self.dparams[j])) 137 138 OUT.write("Error variance estimate: %8.4f\n" % \ 139 (dparams_est[-1] - sum(dparams_est[0:-1]))) 140 OUT.write("Error variance truth: %8.4f\n" % 141 self.error_sd**2) 142 OUT.write("\n") 143 144 145 def simulate(self): 146 147 group_effect_var = self.dparams[0] 148 149 vcomp = self.dparams[1:] 150 vcomp.append(0) 151 152 endog, exog, group, id_matrix = [], [], [], [] 153 154 for i in range(self.ngroups): 155 156 iterators = [lrange(n) for n in self.nest_sizes] 157 158 # The random effects 159 variances = [np.sqrt(v)*np.random.normal(size=n) 160 for v,n in zip(vcomp, self.nest_sizes)] 161 162 gpe = np.random.normal() * np.sqrt(group_effect_var) 163 164 nest_all = [] 165 for j in self.nest_sizes: 166 nest_all.append(set()) 167 168 for nest in product(*iterators): 169 170 group.append(i) 171 172 # The sum of all random effects that apply to this 173 # unit 174 ref = gpe + sum([v[j] for v,j in zip(variances, nest)]) 175 176 exog1 = np.random.normal(size=5) 177 exog1[0] = 1 178 exog.append(exog1) 179 180 error = ref + self.error_sd * np.random.normal() 181 182 endog1 = np.dot(exog1, self.params) + error 183 endog.append(endog1) 184 185 for j in range(len(nest)): 186 nest_all[j].add(tuple(nest[0:j+1])) 187 188 nest1 = [len(x)-1 for x in nest_all] 189 id_matrix.append(nest1[0:-1]) 190 191 self.exog = np.array(exog) 192 self.endog = np.array(endog) 193 self.group = np.array(group) 194 self.id_matrix = np.array(id_matrix) 195 self.time = np.zeros_like(self.endog) 196 197 198 199 200 201 202 203def check_constraint(da, va, ga): 204 """ 205 Check the score testing of the parameter constraints. 206 """ 207 208 209 210 211 212def gen_gendat_ar0(ar): 213 def gendat_ar0(msg = False): 214 ars = AR_simulator() 215 ars.ngroups = 200 216 ars.params = np.r_[0, -1, 1, 0, 0.5] 217 ars.error_sd = 2 218 ars.dparams = [ar,] 219 ars.simulate() 220 return ars, Autoregressive() 221 return gendat_ar0 222 223def gen_gendat_ar1(ar): 224 def gendat_ar1(): 225 ars = AR_simulator() 226 ars.ngroups = 200 227 ars.params = np.r_[0, -0.8, 1.2, 0, 0.5] 228 ars.error_sd = 2 229 ars.dparams = [ar,] 230 ars.simulate() 231 return ars, Autoregressive() 232 return gendat_ar1 233 234def gendat_nested0(): 235 ns = Nested_simulator() 236 ns.error_sd = 1. 237 ns.params = np.r_[0., 1, 1, -1, -1] 238 ns.ngroups = 50 239 ns.nest_sizes = [10, 5] 240 ns.dparams = [2., 1.] 241 ns.simulate() 242 return ns, Nested(ns.id_matrix) 243 244def gendat_nested1(): 245 ns = Nested_simulator() 246 ns.error_sd = 2. 247 ns.params = np.r_[0, 1, 1.3, -0.8, -1.2] 248 ns.ngroups = 50 249 ns.nest_sizes = [10, 5] 250 ns.dparams = [1., 3.] 251 ns.simulate() 252 return ns, Nested(ns.id_matrix) 253 254 255nrep = 100 256 257gendats = [gen_gendat_ar0(ar) for ar in (0, 0.3, 0.6)] 258gendats.extend([gen_gendat_ar1(ar) for ar in (0, 0.3, 0.6)]) 259gendats.extend([gendat_nested0, gendat_nested1]) 260 261lhs = np.array([[0., 1, 1, 0, 0],]) 262rhs = np.r_[0.,] 263 264# Loop over data generating models 265for gendat in gendats: 266 267 pvalues = [] 268 params = [] 269 std_errors = [] 270 dparams = [] 271 272 for j in range(nrep): 273 274 da,va = gendat() 275 ga = Gaussian() 276 277 md = GEE(da.endog, da.exog, da.group, da.time, ga, va) 278 mdf = md.fit() 279 280 scale_inv = 1 / md.estimate_scale() 281 dparams.append(np.r_[va.dparams, scale_inv]) 282 params.append(np.asarray(mdf.params)) 283 std_errors.append(np.asarray(mdf.standard_errors)) 284 285 da,va = gendat() 286 ga = Gaussian() 287 288 md = GEE(da.endog, da.exog, da.group, da.time, ga, va, 289 constraint=(lhs, rhs)) 290 mdf = md.fit() 291 score = md.score_test_results 292 pvalue = score["p-value"] 293 pvalues.append(pvalue) 294 295 dparams_mean = np.array(sum(dparams) / len(dparams)) 296 OUT.write("Checking dependence parameters:\n") 297 da.print_dparams(dparams_mean) 298 299 params = np.array(params) 300 eparams = params.mean(0) 301 sdparams = params.std(0) 302 std_errors = np.array(std_errors) 303 std_errors = std_errors.mean(0) 304 305 OUT.write("Checking parameter values:\n") 306 OUT.write("Observed: ") 307 OUT.write(np.array_str(eparams) + "\n") 308 OUT.write("Expected: ") 309 OUT.write(np.array_str(da.params) + "\n") 310 OUT.write("Absolute difference: ") 311 OUT.write(np.array_str(eparams - da.params) + "\n") 312 OUT.write("Relative difference: ") 313 OUT.write(np.array_str((eparams - da.params) / da.params) + "\n") 314 OUT.write("\n") 315 316 OUT.write("Checking standard errors\n") 317 OUT.write("Observed: ") 318 OUT.write(np.array_str(sdparams) + "\n") 319 OUT.write("Expected: ") 320 OUT.write(np.array_str(std_errors) + "\n") 321 OUT.write("Absolute difference: ") 322 OUT.write(np.array_str(sdparams - std_errors) + "\n") 323 OUT.write("Relative difference: ") 324 OUT.write(np.array_str((sdparams - std_errors) / std_errors) + "\n") 325 OUT.write("\n") 326 327 pvalues.sort() 328 OUT.write("Checking constrained estimation:\n") 329 OUT.write("Left hand side:\n") 330 OUT.write(np.array_str(lhs) + "\n") 331 OUT.write("Right hand side:\n") 332 OUT.write(np.array_str(rhs) + "\n") 333 OUT.write("Observed p-values Expected Null p-values\n") 334 for q in np.arange(0.1, 0.91, 0.1): 335 OUT.write("%20.3f %20.3f\n" % (pvalues[int(q*len(pvalues))], q)) 336 337 OUT.write("=" * 80 + "\n\n") 338 339OUT.close() 340