import numpy as np
from statsmodels.discrete.conditional_models import (
    ConditionalLogit, ConditionalPoisson, ConditionalMNLogit)
from statsmodels.tools.numdiff import approx_fprime
from numpy.testing import assert_allclose
import pandas as pd


def test_logit_1d():

    y = np.r_[0, 1, 0, 1, 0, 1, 0, 1, 1, 1]
    g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]

    x = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    x = x[:, None]

    model = ConditionalLogit(y, x, groups=g)

    # Check the gradient for the denominator of the partial likelihood
    for x in -1, 0, 1, 2:
        params = np.r_[x, ]
        _, grad = model._denom_grad(0, params)
        ngrad = approx_fprime(params, lambda x: model._denom(0, x))
        assert_allclose(grad, ngrad)

    # Check the gradient for the loglikelihood
    for x in -1, 0, 1, 2:
        grad = approx_fprime(np.r_[x, ], model.loglike)
        score = model.score(np.r_[x, ])
        assert_allclose(grad, score, rtol=1e-4)

    result = model.fit()

    # From Stata
    assert_allclose(result.params, np.r_[0.9272407], rtol=1e-5)
    assert_allclose(result.bse, np.r_[1.295155], rtol=1e-5)


def test_logit_2d():

    y = np.r_[0, 1, 0, 1, 0, 1, 0, 1, 1, 1]
    g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]

    x1 = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    x2 = np.r_[0, 0, 1, 0, 0, 1, 0, 1, 1, 1]
    x = np.empty((10, 2))
    x[:, 0] = x1
    x[:, 1] = x2

    model = ConditionalLogit(y, x, groups=g)

    # Check the gradient for the denominator of the partial likelihood
    for x in -1, 0, 1, 2:
        params = np.r_[x, -1.5*x]
        _, grad = model._denom_grad(0, params)
        ngrad = approx_fprime(params, lambda x: model._denom(0, x))
        assert_allclose(grad, ngrad, rtol=1e-5)

    # Check the gradient for the loglikelihood
    for x in -1, 0, 1, 2:
        params = np.r_[-0.5*x, 0.5*x]
        grad = approx_fprime(params, model.loglike)
        score = model.score(params)
        assert_allclose(grad, score, rtol=1e-4)

    result = model.fit()

    # From Stata
    assert_allclose(result.params, np.r_[1.011074, 1.236758], rtol=1e-3)
    assert_allclose(result.bse, np.r_[1.420784, 1.361738], rtol=1e-5)

    result.summary()


def test_formula():

    for j in 0, 1:

        np.random.seed(34234)
        n = 200
        y = np.random.randint(0, 2, size=n)
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        g = np.random.randint(0, 25, size=n)

        x = np.hstack((x1[:, None], x2[:, None]))
        if j == 0:
            model1 = ConditionalLogit(y, x, groups=g)
        else:
            model1 = ConditionalPoisson(y, x, groups=g)
        result1 = model1.fit()

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "g": g})
        if j == 0:
            model2 = ConditionalLogit.from_formula(
                "y ~ 0 + x1 + x2", groups="g", data=df)
        else:
            model2 = ConditionalPoisson.from_formula(
                "y ~ 0 + x1 + x2", groups="g", data=df)
        result2 = model2.fit()

        assert_allclose(result1.params, result2.params, rtol=1e-5)
        assert_allclose(result1.bse, result2.bse, rtol=1e-5)
        assert_allclose(result1.cov_params(), result2.cov_params(), rtol=1e-5)
        assert_allclose(result1.tvalues, result2.tvalues, rtol=1e-5)
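

# The _denom/_denom_grad checks in test_logit_1d and test_logit_2d probe the
# denominator of the group-wise conditional (partial) likelihood.  As a
# hedged reference for what that quantity is (this helper is illustrative
# only; its name and signature are not part of the statsmodels API):
# conditioning on k = sum(y) successes in a group, the denominator sums
# exp(x'b) over all size-k subsets of the group's rows.
def _brute_force_denom(exog, endog, params):
    from itertools import combinations
    k = int(np.sum(endog))  # number of successes conditioned on
    return sum(
        np.exp(np.dot(exog[list(ix)].sum(0), params))  # exp of subset sum
        for ix in combinations(range(len(endog)), k))

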
def test_poisson_1d():

    y = np.r_[3, 1, 1, 4, 5, 2, 0, 1, 6, 2]
    g = np.r_[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]

    x = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    x = x[:, None]

    model = ConditionalPoisson(y, x, groups=g)

    # Check the gradient for the loglikelihood
    for x in -1, 0, 1, 2:
        grad = approx_fprime(np.r_[x, ], model.loglike)
        score = model.score(np.r_[x, ])
        assert_allclose(grad, score, rtol=1e-4)

    result = model.fit()

    # From Stata
    assert_allclose(result.params, np.r_[0.6466272], rtol=1e-4)
    assert_allclose(result.bse, np.r_[0.4170918], rtol=1e-5)


def test_poisson_2d():

    y = np.r_[3, 1, 4, 8, 2, 5, 4, 7, 2, 6]
    g = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]

    x1 = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    x2 = np.r_[2, 1, 0, 0, 1, 2, 3, 2, 0, 1]
    x = np.empty((10, 2))
    x[:, 0] = x1
    x[:, 1] = x2

    model = ConditionalPoisson(y, x, groups=g)

    # Check the gradient for the loglikelihood
    for x in -1, 0, 1, 2:
        params = np.r_[-0.5*x, 0.5*x]
        grad = approx_fprime(params, model.loglike)
        score = model.score(params)
        assert_allclose(grad, score, rtol=1e-4)

    result = model.fit()

    # From Stata
    assert_allclose(result.params, np.r_[-.9478957, -.0134279], rtol=1e-3)
    assert_allclose(result.bse, np.r_[.3874942, .1686712], rtol=1e-5)

    result.summary()


def test_lasso_logistic():

    np.random.seed(3423948)

    n = 200
    groups = np.arange(10)
    groups = np.kron(groups, np.ones(n // 10))
    group_effects = np.random.normal(size=10)
    group_effects = np.kron(group_effects, np.ones(n // 10))

    x = np.random.normal(size=(n, 4))
    params = np.r_[0, 0, 1, 0]
    lin_pred = np.dot(x, params) + group_effects

    mean = 1 / (1 + np.exp(-lin_pred))
    y = (np.random.uniform(size=n) < mean).astype(int)

    model0 = ConditionalLogit(y, x, groups=groups)
    result0 = model0.fit()

    # Should be the same as model0
    model1 = ConditionalLogit(y, x, groups=groups)
    result1 = model1.fit_regularized(L1_wt=0, alpha=0)

    assert_allclose(result0.params, result1.params, rtol=1e-3)

    model2 = ConditionalLogit(y, x, groups=groups)
    result2 = model2.fit_regularized(L1_wt=1, alpha=0.05)

    # Regression test
    assert_allclose(result2.params, np.r_[0, 0, 0.55235152, 0], rtol=1e-4)

    # Test with formula
    df = pd.DataFrame({"y": y, "x1": x[:, 0], "x2": x[:, 1], "x3": x[:, 2],
                       "x4": x[:, 3], "groups": groups})
    fml = "y ~ 0 + x1 + x2 + x3 + x4"
    model3 = ConditionalLogit.from_formula(fml, groups="groups", data=df)
    result3 = model3.fit_regularized(L1_wt=1, alpha=0.05)
    assert_allclose(result2.params, result3.params)
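

# A note on the penalty settings used in the lasso tests (hedged summary;
# see the statsmodels elastic-net documentation for the authoritative form):
# fit_regularized penalizes roughly
#
#     alpha * (L1_wt * |b|_1 + (1 - L1_wt) / 2 * |b|_2 ** 2)
#
# so alpha=0 is unpenalized (hence the comparison against plain fit() above)
# and L1_wt=1 is a pure lasso, which can shrink coefficients exactly to
# zero, as the regression tests on the sparse parameter vector assert.  A
# hypothetical sweep (this helper is a sketch, not part of the test suite)
# makes the shrinkage visible:
def _lasso_path_sketch(y, x, groups, alphas=(0.01, 0.05, 0.2)):
    # Fit at several penalty weights and stack the coefficient vectors.
    return np.array([
        ConditionalLogit(y, x, groups=groups)
        .fit_regularized(L1_wt=1, alpha=a).params
        for a in alphas])

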
def test_lasso_poisson():

    np.random.seed(342394)

    n = 200
    groups = np.arange(10)
    groups = np.kron(groups, np.ones(n // 10))
    group_effects = np.random.normal(size=10)
    group_effects = np.kron(group_effects, np.ones(n // 10))

    x = np.random.normal(size=(n, 4))
    params = np.r_[0, 0, 1, 0]
    lin_pred = np.dot(x, params) + group_effects

    mean = np.exp(lin_pred)
    y = np.random.poisson(mean)

    model0 = ConditionalPoisson(y, x, groups=groups)
    result0 = model0.fit()

    # Should be the same as model0
    model1 = ConditionalPoisson(y, x, groups=groups)
    result1 = model1.fit_regularized(L1_wt=0, alpha=0)

    assert_allclose(result0.params, result1.params, rtol=1e-3)

    model2 = ConditionalPoisson(y, x, groups=groups)
    result2 = model2.fit_regularized(L1_wt=1, alpha=0.2)

    # Regression test
    assert_allclose(result2.params, np.r_[0, 0, 0.91697508, 0], rtol=1e-4)

    # Test with formula
    df = pd.DataFrame({"y": y, "x1": x[:, 0], "x2": x[:, 1], "x3": x[:, 2],
                       "x4": x[:, 3], "groups": groups})
    fml = "y ~ 0 + x1 + x2 + x3 + x4"
    model3 = ConditionalPoisson.from_formula(fml, groups="groups", data=df)
    result3 = model3.fit_regularized(L1_wt=1, alpha=0.2)
    assert_allclose(result2.params, result3.params)


def gen_mnlogit(n):

    np.random.seed(235)

    g = np.kron(np.ones(5), np.arange(n//5))
    x1 = np.random.normal(size=n)
    x2 = np.random.normal(size=n)
    xm = np.concatenate((x1[:, None], x2[:, None]), axis=1)
    pa = np.array([[0, 1, -1], [0, 2, -1]])
    lpr = np.dot(xm, pa)
    pr = np.exp(lpr)
    pr /= pr.sum(1)[:, None]
    cpr = pr.cumsum(1)

    # Sample a three-level outcome by inverting the cumulative category
    # probabilities; cpr[:, 2] is 1 by construction, so the first assignment
    # puts every observation in category 2 before the lower categories are
    # filled in.
    y = 2 * np.ones(n)
    u = np.random.uniform(size=n)
    y[u < cpr[:, 2]] = 2
    y[u < cpr[:, 1]] = 1
    y[u < cpr[:, 0]] = 0

    df = pd.DataFrame({"y": y, "x1": x1,
                       "x2": x2, "g": g})
    return df


def test_conditional_mnlogit_grad():

    df = gen_mnlogit(90)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)

    # Compare the analytic gradients to numeric gradients
    for _ in range(5):
        za = np.random.normal(size=4)
        grad = model.score(za)
        ngrad = approx_fprime(za, model.loglike)
        assert_allclose(grad, ngrad, rtol=1e-5, atol=1e-3)


def test_conditional_mnlogit_2d():

    df = gen_mnlogit(90)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=df)
    result = model.fit()

    # Regression tests
    assert_allclose(
        result.params,
        np.asarray([[0.75592035, -1.58565494],
                    [1.82919869, -1.32594231]]),
        rtol=1e-5, atol=1e-5)
    assert_allclose(
        result.bse,
        np.asarray([[0.68099698, 0.70142727],
                    [0.65190315, 0.59653771]]),
        rtol=1e-5, atol=1e-5)


def test_conditional_mnlogit_3d():

    df = gen_mnlogit(90)
    df["x3"] = np.random.normal(size=df.shape[0])
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=df)
    result = model.fit()

    # Regression tests
    assert_allclose(
        result.params,
        np.asarray([[ 0.729629, -1.633673],
                    [ 1.879019, -1.327163],
                    [-0.114124, -0.109378]]),
        atol=1e-5, rtol=1e-5)

    assert_allclose(
        result.bse,
        np.asarray([[0.682965, 0.60472],
                    [0.672947, 0.42401],
                    [0.722631, 0.33663]]),
        atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()
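

# Hedged closing note on the multinomial parameterization exercised above
# (inferred from the assertions in these tests, not from a documented
# layout): with p covariates and k outcome levels, ConditionalMNLogit
# estimates a (p, k - 1) coefficient matrix, one column per non-reference
# category, while loglike()/score() operate on the flattened vector.  That
# is why test_conditional_mnlogit_grad draws size=4 vectors for the
# two-covariate, three-level model, and why the regression tests assert
# 2x2 and 3x2 parameter shapes.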