1import numpy as np
2from statsmodels.discrete.conditional_models import (
3      ConditionalLogit, ConditionalPoisson, ConditionalMNLogit)
4from statsmodels.tools.numdiff import approx_fprime
5from numpy.testing import assert_allclose
6import pandas as pd
7
8
def test_logit_1d():
    """One-covariate conditional logit: gradient checks and Stata values."""

    endog = np.r_[0, 1, 0, 1, 0, 1, 0, 1, 1, 1]
    groups = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]
    exog = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0][:, None]

    model = ConditionalLogit(endog, exog, groups=groups)

    # The analytic gradient of the partial-likelihood denominator must
    # agree with a numeric derivative at several parameter values.
    for v in (-1, 0, 1, 2):
        params = np.r_[v, ]
        _, grad = model._denom_grad(0, params)
        ngrad = approx_fprime(params, lambda p: model._denom(0, p))
        assert_allclose(grad, ngrad)

    # The analytic score must agree with a numeric gradient of the
    # loglikelihood.
    for v in (-1, 0, 1, 2):
        ngrad = approx_fprime(np.r_[v, ], model.loglike)
        score = model.score(np.r_[v, ])
        assert_allclose(ngrad, score, rtol=1e-4)

    result = model.fit()

    # Reference values computed with Stata
    assert_allclose(result.params, np.r_[0.9272407], rtol=1e-5)
    assert_allclose(result.bse, np.r_[1.295155], rtol=1e-5)
37
38
def test_logit_2d():
    """Two-covariate conditional logit: gradient checks and Stata values."""

    endog = np.r_[0, 1, 0, 1, 0, 1, 0, 1, 1, 1]
    groups = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]

    c1 = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    c2 = np.r_[0, 0, 1, 0, 0, 1, 0, 1, 1, 1]
    exog = np.column_stack((c1, c2)).astype(np.float64)

    model = ConditionalLogit(endog, exog, groups=groups)

    # The analytic gradient of the partial-likelihood denominator must
    # agree with a numeric derivative at several parameter values.
    for v in (-1, 0, 1, 2):
        params = np.r_[v, -1.5 * v]
        _, grad = model._denom_grad(0, params)
        ngrad = approx_fprime(params, lambda p: model._denom(0, p))
        assert_allclose(grad, ngrad, rtol=1e-5)

    # The analytic score must agree with a numeric gradient of the
    # loglikelihood.
    for v in (-1, 0, 1, 2):
        params = np.r_[-0.5 * v, 0.5 * v]
        ngrad = approx_fprime(params, model.loglike)
        score = model.score(params)
        assert_allclose(ngrad, score, rtol=1e-4)

    result = model.fit()

    # Reference values computed with Stata
    assert_allclose(result.params, np.r_[1.011074, 1.236758], rtol=1e-3)
    assert_allclose(result.bse, np.r_[1.420784, 1.361738], rtol=1e-5)

    # Smoke test
    result.summary()
73
74
def test_formula():
    """Array-based and formula-based fits must produce identical results."""

    for j in 0, 1:

        np.random.seed(34234)
        n = 200
        y = np.random.randint(0, 2, size=n)
        x1 = np.random.normal(size=n)
        x2 = np.random.normal(size=n)
        g = np.random.randint(0, 25, size=n)

        # j == 0 exercises the logit model, j == 1 the Poisson model.
        cls = ConditionalLogit if j == 0 else ConditionalPoisson

        xmat = np.hstack((x1[:, None], x2[:, None]))
        result1 = cls(y, xmat, groups=g).fit()

        df = pd.DataFrame({"y": y, "x1": x1, "x2": x2, "g": g})
        result2 = cls.from_formula(
            "y ~ 0 + x1 + x2", groups="g", data=df).fit()

        assert_allclose(result1.params, result2.params, rtol=1e-5)
        assert_allclose(result1.bse, result2.bse, rtol=1e-5)
        assert_allclose(result1.cov_params(), result2.cov_params(),
                        rtol=1e-5)
        assert_allclose(result1.tvalues, result2.tvalues, rtol=1e-5)
106
107
def test_poisson_1d():
    """One-covariate conditional Poisson: score check and Stata values."""

    endog = np.r_[3, 1, 1, 4, 5, 2, 0, 1, 6, 2]
    groups = np.r_[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
    exog = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0][:, None]

    model = ConditionalPoisson(endog, exog, groups=groups)

    # The analytic score must agree with a numeric gradient of the
    # loglikelihood.
    for v in (-1, 0, 1, 2):
        ngrad = approx_fprime(np.r_[v, ], model.loglike)
        score = model.score(np.r_[v, ])
        assert_allclose(ngrad, score, rtol=1e-4)

    result = model.fit()

    # Reference values computed with Stata
    assert_allclose(result.params, np.r_[0.6466272], rtol=1e-4)
    assert_allclose(result.bse, np.r_[0.4170918], rtol=1e-5)
129
130
def test_poisson_2d():
    """Two-covariate conditional Poisson: score check and Stata values."""

    endog = np.r_[3, 1, 4, 8, 2, 5, 4, 7, 2, 6]
    groups = np.r_[0, 0, 0, 1, 1, 1, 2, 2, 2, 2]

    c1 = np.r_[0, 1, 0, 0, 1, 1, 0, 0, 1, 0]
    c2 = np.r_[2, 1, 0, 0, 1, 2, 3, 2, 0, 1]
    exog = np.column_stack((c1, c2)).astype(np.float64)

    model = ConditionalPoisson(endog, exog, groups=groups)

    # The analytic score must agree with a numeric gradient of the
    # loglikelihood.
    for v in (-1, 0, 1, 2):
        params = np.r_[-0.5 * v, 0.5 * v]
        ngrad = approx_fprime(params, model.loglike)
        score = model.score(params)
        assert_allclose(ngrad, score, rtol=1e-4)

    result = model.fit()

    # Reference values computed with Stata
    assert_allclose(result.params, np.r_[-.9478957, -.0134279], rtol=1e-3)
    assert_allclose(result.bse, np.r_[.3874942, .1686712], rtol=1e-5)

    # Smoke test
    result.summary()
158
159
def test_lasso_logistic():
    """fit_regularized for conditional logit: unpenalized fit matches the
    MLE, lasso estimates match a regression value, and the formula
    interface agrees with the array interface."""

    np.random.seed(3423948)

    n = 200
    groups = np.kron(np.arange(10), np.ones(n // 10))
    group_effects = np.kron(np.random.normal(size=10), np.ones(n // 10))

    x = np.random.normal(size=(n, 4))
    lin_pred = np.dot(x, np.r_[0, 0, 1, 0]) + group_effects

    prob = 1 / (1 + np.exp(-lin_pred))
    y = (np.random.uniform(size=n) < prob).astype(int)

    result0 = ConditionalLogit(y, x, groups=groups).fit()

    # With zero penalty, fit_regularized should reproduce the MLE.
    result1 = ConditionalLogit(y, x, groups=groups).fit_regularized(
        L1_wt=0, alpha=0)
    assert_allclose(result0.params, result1.params, rtol=1e-3)

    result2 = ConditionalLogit(y, x, groups=groups).fit_regularized(
        L1_wt=1, alpha=0.05)

    # Regression test
    assert_allclose(result2.params, np.r_[0, 0, 0.55235152, 0], rtol=1e-4)

    # The formula interface must agree with the array interface.
    df = pd.DataFrame({"y": y, "x1": x[:, 0], "x2": x[:, 1], "x3": x[:, 2],
                       "x4": x[:, 3], "groups": groups})
    fml = "y ~ 0 + x1 + x2 + x3 + x4"
    model3 = ConditionalLogit.from_formula(fml, groups="groups", data=df)
    result3 = model3.fit_regularized(L1_wt=1, alpha=0.05)
    assert_allclose(result2.params, result3.params)
199
200
def test_lasso_poisson():
    """fit_regularized for conditional Poisson: unpenalized fit matches the
    MLE, lasso estimates match a regression value, and the formula
    interface agrees with the array interface."""

    np.random.seed(342394)

    n = 200
    groups = np.kron(np.arange(10), np.ones(n // 10))
    group_effects = np.kron(np.random.normal(size=10), np.ones(n // 10))

    x = np.random.normal(size=(n, 4))
    lin_pred = np.dot(x, np.r_[0, 0, 1, 0]) + group_effects

    y = np.random.poisson(np.exp(lin_pred))

    result0 = ConditionalPoisson(y, x, groups=groups).fit()

    # With zero penalty, fit_regularized should reproduce the MLE.
    result1 = ConditionalPoisson(y, x, groups=groups).fit_regularized(
        L1_wt=0, alpha=0)
    assert_allclose(result0.params, result1.params, rtol=1e-3)

    result2 = ConditionalPoisson(y, x, groups=groups).fit_regularized(
        L1_wt=1, alpha=0.2)

    # Regression test
    assert_allclose(result2.params, np.r_[0, 0, 0.91697508, 0], rtol=1e-4)

    # The formula interface must agree with the array interface.
    df = pd.DataFrame({"y": y, "x1": x[:, 0], "x2": x[:, 1], "x3": x[:, 2],
                       "x4": x[:, 3], "groups": groups})
    fml = "y ~ 0 + x1 + x2 + x3 + x4"
    model3 = ConditionalPoisson.from_formula(fml, groups="groups", data=df)
    result3 = model3.fit_regularized(L1_wt=1, alpha=0.2)
    assert_allclose(result2.params, result3.params)
240
241
242def gen_mnlogit(n):
243
244    np.random.seed(235)
245
246    g = np.kron(np.ones(5), np.arange(n//5))
247    x1 = np.random.normal(size=n)
248    x2 = np.random.normal(size=n)
249    xm = np.concatenate((x1[:, None], x2[:, None]), axis=1)
250    pa = np.array([[0, 1, -1], [0, 2, -1]])
251    lpr = np.dot(xm, pa)
252    pr = np.exp(lpr)
253    pr /= pr.sum(1)[:, None]
254    cpr = pr.cumsum(1)
255    y = 2 * np.ones(n)
256    u = np.random.uniform(size=n)
257    y[u < cpr[:, 2]] = 2
258    y[u < cpr[:, 1]] = 1
259    y[u < cpr[:, 0]] = 0
260
261    df = pd.DataFrame({"y": y, "x1": x1,
262                       "x2": x2, "g": g})
263    return df
264
265
def test_conditional_mnlogit_grad():
    """The analytic MNLogit score must match a numeric gradient."""

    data = gen_mnlogit(90)
    model = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=data)

    # Check at several random parameter points.
    for _ in range(5):
        point = np.random.normal(size=4)
        grad = model.score(point)
        ngrad = approx_fprime(point, model.loglike)
        assert_allclose(grad, ngrad, rtol=1e-5, atol=1e-3)
278
279
def test_conditional_mnlogit_2d():
    """Regression test for MNLogit estimates and standard errors (2 covariates)."""

    data = gen_mnlogit(90)
    result = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2", groups="g", data=data).fit()

    # Regression values (not from an external reference)
    expected_params = np.asarray([[0.75592035, -1.58565494],
                                  [1.82919869, -1.32594231]])
    expected_bse = np.asarray([[0.68099698, 0.70142727],
                               [0.65190315, 0.59653771]])
    assert_allclose(result.params, expected_params, rtol=1e-5, atol=1e-5)
    assert_allclose(result.bse, expected_bse, rtol=1e-5, atol=1e-5)
298
299
def test_conditional_mnlogit_3d():
    """Regression test for MNLogit estimates and standard errors (3 covariates)."""

    data = gen_mnlogit(90)
    # Add a third covariate; draws come after gen_mnlogit's fixed seed,
    # so they are reproducible.
    data["x3"] = np.random.normal(size=data.shape[0])
    result = ConditionalMNLogit.from_formula(
        "y ~ 0 + x1 + x2 + x3", groups="g", data=data).fit()

    # Regression values (not from an external reference)
    expected_params = np.asarray([[ 0.729629, -1.633673],
                                  [ 1.879019, -1.327163],
                                  [-0.114124, -0.109378]])
    assert_allclose(result.params, expected_params, atol=1e-5, rtol=1e-5)

    expected_bse = np.asarray([[0.682965, 0.60472],
                               [0.672947, 0.42401],
                               [0.722631, 0.33663]])
    assert_allclose(result.bse, expected_bse, atol=1e-5, rtol=1e-5)

    # Smoke test
    result.summary()
325