1""" 2Assesment of Generalized Estimating Equations using simulation. 3 4This script checks ordinal and nominal models for multinomial data. 5 6See the generated file "gee_categorical_simulation_check.txt" for 7results. 8""" 9from statsmodels.compat.python import lrange 10import numpy as np 11from scipy import stats 12from statsmodels.genmod.generalized_estimating_equations import GEE,\ 13 gee_setup_ordinal, gee_setup_nominal,\ 14 gee_ordinal_starting_values, Multinomial 15from statsmodels.genmod.families import Binomial 16from statsmodels.genmod.cov_struct import GlobalOddsRatio 17from .gee_gaussian_simulation_check import GEE_simulator 18 19 20class ordinal_simulator(GEE_simulator): 21 22 # The thresholds where the latent continuous process is cut to 23 # obtain the categorical values. 24 thresholds = None 25 26 27 def true_params(self): 28 return np.concatenate((self.thresholds, self.params)) 29 30 31 def starting_values(self, nconstraints): 32 beta = gee_ordinal_starting_values(self.endog, 33 len(self.params)) 34 if nconstraints > 0: 35 m = self.exog_ex.shape[1] - nconstraints 36 beta = beta[0:m] 37 38 return beta 39 40 41 def print_dparams(self, dparams_est): 42 OUT.write("Odds ratio estimate: %8.4f\n" % dparams_est[0]) 43 OUT.write("Odds ratio truth: %8.4f\n" % 44 self.dparams[0]) 45 OUT.write("\n") 46 47 48 def simulate(self): 49 50 endog, exog, group, time = [], [], [], [] 51 52 for i in range(self.ngroups): 53 54 gsize = np.random.randint(self.group_size_range[0], 55 self.group_size_range[1]) 56 57 group.append([i,] * gsize) 58 59 time1 = np.random.normal(size=(gsize,2)) 60 time.append(time1) 61 62 exog1 = np.random.normal(size=(gsize, len(self.params))) 63 exog.append(exog1) 64 65 lp = np.dot(exog1, self.params) 66 67 z = np.random.uniform(size=gsize) 68 z = np.log(z / (1 - z)) + lp 69 endog1 = np.array([np.sum(x > self.thresholds) for x in z]) 70 endog.append(endog1) 71 72 self.exog = np.concatenate(exog, axis=0) 73 self.endog = np.concatenate(endog) 74 self.time = np.concatenate(time, axis=0) 75 self.group = np.concatenate(group) 76 self.offset = np.zeros(len(self.endog), dtype=np.float64) 77 78 79class nominal_simulator(GEE_simulator): 80 81 def starting_values(self, nconstraints): 82 return None 83 84 def true_params(self): 85 return np.concatenate(self.params[:-1]) 86 87 def print_dparams(self, dparams_est): 88 OUT.write("Odds ratio estimate: %8.4f\n" % dparams_est[0]) 89 OUT.write("Odds ratio truth: %8.4f\n" % self.dparams[0]) 90 OUT.write("\n") 91 92 def simulate(self): 93 94 endog, exog, group, time = [], [], [], [] 95 96 for i in range(self.ngroups): 97 98 gsize = np.random.randint(self.group_size_range[0], 99 self.group_size_range[1]) 100 101 group.append([i,] * gsize) 102 103 time1 = np.random.normal(size=(gsize,2)) 104 time.append(time1) 105 106 exog1 = np.random.normal(size=(gsize, len(self.params[0]))) 107 exog.append(exog1) 108 109 # Probabilities for each outcome 110 prob = [np.exp(np.dot(exog1, p)) for p in self.params] 111 prob = np.vstack(prob).T 112 prob /= prob.sum(1)[:, None] 113 114 m = len(self.params) 115 endog1 = [] 116 for k in range(gsize): 117 pdist = stats.rv_discrete(values=(lrange(m), 118 prob[k,:])) 119 endog1.append(pdist.rvs()) 120 121 endog.append(np.asarray(endog1)) 122 123 self.exog = np.concatenate(exog, axis=0) 124 self.endog = np.concatenate(endog).astype(np.int32) 125 self.time = np.concatenate(time, axis=0) 126 self.group = np.concatenate(group) 127 self.offset = np.zeros(len(self.endog), dtype=np.float64) 128 129 130 131def gendat_ordinal(): 132 133 os = ordinal_simulator() 134 os.params = np.r_[0., 1] 135 os.ngroups = 200 136 os.thresholds = [1, 0, -1] 137 os.dparams = [1.,] 138 os.simulate() 139 140 data = np.concatenate((os.endog[:,None], os.exog, 141 os.group[:,None]), axis=1) 142 143 os.endog_ex, os.exog_ex, os.intercepts, os.nthresh = \ 144 gee_setup_ordinal(data, 0) 145 146 os.group_ex = os.exog_ex[:,-1] 147 os.exog_ex = os.exog_ex[:,0:-1] 148 149 os.exog_ex = np.concatenate((os.intercepts, os.exog_ex), 150 axis=1) 151 152 va = GlobalOddsRatio(4, "ordinal") 153 154 lhs = np.array([[0., 0., 0, 1., 0.], [0., 0, 0, 0, 1]]) 155 rhs = np.r_[0., 1] 156 157 return os, va, Binomial(), (lhs, rhs) 158 159 160def gendat_nominal(): 161 162 ns = nominal_simulator() 163 164 # The last component of params must be identically zero 165 ns.params = [np.r_[0., 1], np.r_[-1., 0], np.r_[0., 0]] 166 ns.ngroups = 200 167 ns.dparams = [1., ] 168 ns.simulate() 169 170 data = np.concatenate((ns.endog[:,None], ns.exog, 171 ns.group[:,None]), axis=1) 172 173 ns.endog_ex, ns.exog_ex, ns.exog_ne, ns.nlevel = \ 174 gee_setup_nominal(data, 0, [3,]) 175 176 ns.group_ex = ns.exog_ne[:,0] 177 178 va = GlobalOddsRatio(3, "nominal") 179 180 lhs = np.array([[0., 1., 1, 0],]) 181 rhs = np.r_[0.,] 182 183 return ns, va, Multinomial(3), (lhs, rhs) 184 185 186if __name__ == '__main__': 187 188 nrep = 100 189 190 OUT = open("gee_categorical_simulation_check.txt", "w") 191 192 np.set_printoptions(formatter={'all': lambda x: "%8.3f" % x}, 193 suppress=True) 194 195 # Loop over data generating models 196 gendats = [gendat_nominal, gendat_ordinal] 197 198 for jg,gendat in enumerate(gendats): 199 200 dparams = [] 201 params = [] 202 std_errors = [] 203 pvalues = [] 204 205 for j in range(nrep): 206 207 da, va, mt, constraint = gendat() 208 209 beta = da.starting_values(0) 210 211 md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None, 212 mt, va) 213 mdf = md.fit(start_params = beta) 214 215 if mdf is None: 216 continue 217 218 scale_inv = 1 / md.estimate_scale() 219 220 dparams.append(np.r_[va.dparams, scale_inv]) 221 222 params.append(np.asarray(mdf.params)) 223 std_errors.append(np.asarray(mdf.standard_errors)) 224 225 da, va, mt, constraint = gendat() 226 227 beta = da.starting_values(constraint[0].shape[0]) 228 229 md = GEE(da.endog_ex, da.exog_ex, da.group_ex, None, 230 mt, va, constraint=constraint) 231 mdf = md.fit(start_params = beta) 232 233 if mdf is None: 234 continue 235 236 score = md.score_test_results 237 pvalues.append(score["p-value"]) 238 239 dparams_mean = np.array(sum(dparams) / len(dparams)) 240 241 OUT.write("%s data.\n" % ("Nominal", "Ordinal")[jg]) 242 OUT.write("%d runs converged successfully.\n" % len(pvalues)) 243 244 OUT.write("Checking dependence parameters:\n") 245 da.print_dparams(dparams_mean) 246 247 params = np.array(params) 248 eparams = params.mean(0) 249 sdparams = params.std(0) 250 std_errors = np.array(std_errors) 251 std_errors = std_errors.mean(0) 252 true_params = da.true_params() 253 254 OUT.write("Checking parameter values:\n") 255 OUT.write("Observed: ") 256 OUT.write(np.array_str(eparams) + "\n") 257 OUT.write("Expected: ") 258 OUT.write(np.array_str(true_params) + "\n") 259 OUT.write("Absolute difference: ") 260 OUT.write(np.array_str(eparams - true_params) + "\n") 261 OUT.write("Relative difference: ") 262 OUT.write(np.array_str((eparams - true_params) / true_params) 263 + "\n") 264 OUT.write("\n") 265 266 OUT.write("Checking standard errors:\n") 267 OUT.write("Observed: ") 268 OUT.write(np.array_str(sdparams) + "\n") 269 OUT.write("Expected: ") 270 OUT.write(np.array_str(std_errors) + "\n") 271 OUT.write("Absolute difference: ") 272 OUT.write(np.array_str(sdparams - std_errors) + "\n") 273 OUT.write("Relative difference: ") 274 OUT.write(np.array_str((sdparams - std_errors) / std_errors) 275 + "\n") 276 OUT.write("\n") 277 278 OUT.write("Checking constrained estimation:\n") 279 OUT.write("Observed Expected\n") 280 281 pvalues.sort() 282 for q in np.arange(0.1, 0.91, 0.1): 283 OUT.write("%10.3f %10.3f\n" % 284 (pvalues[int(q*len(pvalues))], q)) 285 286 OUT.write("=" * 80 + "\n\n") 287 288 OUT.close() 289