# -*- coding: utf-8 -*-
"""
Tests for TheilGLS (Theil-Goldberger mixed estimation, penalized GLS),
comparing against Stata's tgmixed results and against OLS/GLS special cases.

Created on Mon May 05 17:29:56 2014

Author: Josef Perktold
"""

import copy
import os

import numpy as np
import pandas as pd
import pytest

from numpy.testing import assert_allclose

from statsmodels.regression.linear_model import OLS, GLS
from statsmodels.sandbox.regression.penalized import TheilGLS
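

# Reference-only sketch of the Theil-Goldberger mixed estimator that TheilGLS
# is based on. This is a minimal textbook form, assuming a full prior
# covariance matrix; it is not the statsmodels implementation and is not used
# by the tests below. Stochastic prior restrictions R b ~ N(q, sigma_prior)
# are combined with the data, with the penalization weight lambd scaling the
# prior precision:
#     b = inv(X'X + sigma2 * lambd * R' V^+ R) (X'y + sigma2 * lambd * R' V^+ q)
def _mixed_estimation_sketch(y, x, r_matrix, q, sigma_prior, sigma2, lambd=1.0):
    r = np.atleast_2d(np.asarray(r_matrix, dtype=float))
    # (pseudo-)inverse of the prior covariance of the restrictions
    v_pinv = np.linalg.pinv(np.atleast_2d(np.asarray(sigma_prior, dtype=float)))
    penalty = sigma2 * lambd * r.T.dot(v_pinv)
    xpx = x.T.dot(x) + penalty.dot(r)
    xpy = x.T.dot(y) + penalty.dot(np.asarray(q, dtype=float))
    return np.linalg.solve(xpx, xpy)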


class TestTheilTextile(object):

    @classmethod
    def setup_class(cls):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(cur_dir, "results",
                                "theil_textile_predict.csv")
        cls.res_predict = pd.read_csv(filepath, sep=",")

        names = "year lconsump lincome lprice".split()  # data column order

        data = np.array('''\
        1923 1.99651 1.98543 2.00432
        1924 1.99564 1.99167 2.00043
        1925 2 2 2
        1926 2.04766 2.02078 1.95713
        1927 2.08707 2.02078 1.93702
        1928 2.07041 2.03941 1.95279
        1929 2.08314 2.04454 1.95713
        1930 2.13354 2.05038 1.91803
        1931 2.18808 2.03862 1.84572
        1932 2.18639 2.02243 1.81558
        1933 2.20003 2.00732 1.78746
        1934 2.14799 1.97955 1.79588
        1935 2.13418 1.98408 1.80346
        1936 2.22531 1.98945 1.72099
        1937 2.18837 2.0103 1.77597
        1938 2.17319 2.00689 1.77452
        1939 2.2188 2.0162 1.78746'''.split(), float).reshape(-1, 4)

        endog = data[:, 1]
        # constant at the end to match Stata
        exog = np.column_stack((data[:, 2:], np.ones(endog.shape[0])))

        # Stata prior spec:
        # prior(lprice -0.7 0.15 lincome 1 0.15) cov(lprice lincome -0.01)
        # rows of r_matrix select lincome and lprice (constant is last)
        r_matrix = np.array([[1, 0, 0], [0, 1, 0]])
        r_mean = [1, -0.7]

        cov_r = np.array([[0.15**2, -0.01], [-0.01, 0.15**2]])
        mod = TheilGLS(endog, exog, r_matrix, q_matrix=r_mean,
                       sigma_prior=cov_r)
        cls.res1 = mod.fit(cov_type='data-prior', use_t=True)
        # tgmixed uses the scale from the initial OLS, so override it here
        # cls.res1._cache['scale'] = 0.0001852252884817586  # from tg_mixed
        cls.res1._cache['scale'] = 0.00018334123641580062  # from OLS
        from .results import results_theil_textile as resmodule
        cls.res2 = resmodule.results_theil_textile

    def test_basic(self):
        pt = self.res2.params_table[:, :6].T
        params2, bse2, tvalues2, pvalues2, ci_low, ci_upp = pt
        assert_allclose(self.res1.params, params2, rtol=2e-6)

        # TODO: tgmixed seems to use the scale from the initial OLS, not from
        # the final result, np.sqrt(res.scale / res_ols.scale);
        # see mse_resid below, which is equal to scale.
        # Earlier candidate correction factors, kept for reference:
        # corr_fact = 0.9836026210570028
        # corr_fact = 0.97376865041463734
        corr_fact = 1
        assert_allclose(self.res1.bse / corr_fact, bse2, rtol=2e-6)
        assert_allclose(self.res1.tvalues * corr_fact, tvalues2, rtol=2e-6)
        # pvalues are very small, so relative comparison is not reliable
        # assert_allclose(self.res1.pvalues, pvalues2, atol=2e-6)
        # assert_allclose(self.res1.pvalues, pvalues2, rtol=0.7)
        ci = self.res1.conf_int()
        # not scale corrected
        assert_allclose(ci[:, 0], ci_low, rtol=0.01)
        assert_allclose(ci[:, 1], ci_upp, rtol=0.01)
        assert_allclose(self.res1.rsquared, self.res2.r2, rtol=2e-6)

        # Note: tgmixed uses k_exog for df_resid
        corr_fact = self.res1.df_resid / self.res2.df_r
        assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact),
                        self.res2.rmse, rtol=2e-6)

        # atol consistent with the params agreement (rtol=2e-6 at values ~2)
        assert_allclose(self.res1.fittedvalues,
                        self.res_predict['fittedvalues'], atol=5e-6)

    def test_other(self):
        tc = self.res1.test_compatibility()
        assert_allclose(np.squeeze(tc[0]), self.res2.compat, rtol=2e-6)
        assert_allclose(np.squeeze(tc[1]), self.res2.pvalue, rtol=2e-6)

        frac = self.res1.share_data()
        # TODO: check again; tgmixed presumably uses the final scale in the
        # hat matrix. This passed in a previous version, but scale is now
        # overridden with the OLS scale.
        # assert_allclose(frac, self.res2.frac_sample, rtol=2e-6)
        # regression test:
        assert_allclose(frac, 0.6946116246864239, rtol=2e-6)

    def test_no_penalization(self):
        res_ols = OLS(self.res1.model.endog, self.res1.model.exog).fit()
        res_theil = self.res1.model.fit(pen_weight=0, cov_type='data-prior')
        assert_allclose(res_theil.params, res_ols.params, rtol=1e-10)
        assert_allclose(res_theil.bse, res_ols.bse, rtol=1e-10)

    @pytest.mark.smoke
    def test_summary(self):
        with pytest.warns(UserWarning):
            self.res1.summary()


class CheckEquivalenceMixin(object):
    # compares a penalized result (res1) against a reference result (res2)

    tol = {'default': (1e-4, 1e-20)}

    @classmethod
    def get_sample(cls):
        np.random.seed(987456)
        nobs, k_vars = 200, 5
        beta = 0.5 * np.array([0.1, 1, 1, 0, 0])
        x = np.random.randn(nobs, k_vars)
        x[:, 0] = 1
        y = np.dot(x, beta) + 2 * np.random.randn(nobs)
        return y, x

    def test_attributes(self):
        attributes_fit = ['params', 'rsquared', 'df_resid', 'df_model',
                          'llf', 'aic', 'bic',
                          # 'fittedvalues', 'resid'  # checked separately below
                          ]
        attributes_inference = ['bse', 'tvalues', 'pvalues']
        attributes = copy.copy(attributes_fit)

        if not getattr(self, 'skip_inference', False):
            attributes.extend(attributes_inference)

        for att in attributes:
            r1 = getattr(self.res1, att)
            r2 = getattr(self.res2, att)
            if not np.size(r1) == 1:
                # res2 can be a smaller (restricted) model; compare the overlap
                r1 = r1[:len(r2)]

            # check if we have overwritten tolerance
            rtol, atol = self.tol.get(att, self.tol['default'])
            message = 'attribute: ' + att
            assert_allclose(r1, r2, rtol=rtol, atol=atol, err_msg=message)

        # models are not close enough for some attributes at high precision,
        # so compare fitted values and residuals at looser tolerances
        assert_allclose(self.res1.fittedvalues, self.res2.fittedvalues,
                        rtol=1e-3, atol=1e-4)
        assert_allclose(self.res1.resid, self.res2.resid,
                        rtol=1e-3, atol=1e-4)


class TestTheil1(CheckEquivalenceMixin):
    # penalize last two parameters to zero

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x[:, :3]).fit()


class TestTheil2(CheckEquivalenceMixin):
    # no penalization = same as OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(0)
        cls.res2 = OLS(y, x).fit()


class TestTheil3(CheckEquivalenceMixin):
    # perfect multicollinearity = same as OLS in terms of fit
    # inference (bse, ...) is different

    @classmethod
    def setup_class(cls):
        cls.skip_inference = True
        y, x = cls.get_sample()
        xd = np.column_stack((x, x))
        # r_matrix selects the duplicated block of parameters
        r_matrix = np.eye(5, 10, 5)
        mod1 = TheilGLS(y, xd, r_matrix=r_matrix)
        cls.res1 = mod1.fit(0.001, cov_type='data-prior')
        cls.res2 = OLS(y, x).fit()


class TestTheilGLS(CheckEquivalenceMixin):
    # penalize last two parameters to zero, with GLS weights

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        nobs = len(y)
        weights = (np.arange(nobs) < (nobs // 2)) + 0.5
        mod1 = TheilGLS(y, x, sigma=weights, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = GLS(y, x[:, :3], sigma=weights).fit()


class TestTheilLinRestriction(CheckEquivalenceMixin):
    # impose a linear restriction with small uncertainty - close to OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2: x2 imposes equal coefficients on columns 1, 2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision; careful: cls.tol is mutable, so build a new dict
        tol = {'pvalues': (1e-4, 2e-7),
               'tvalues': (5e-4, 0)}
        tol.update(cls.tol)
        cls.tol = tol


class TestTheilLinRestrictionApprox(CheckEquivalenceMixin):
    # impose a linear restriction with some uncertainty

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2: x2 imposes equal coefficients on columns 1, 2
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(100)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision; careful: cls.tol is mutable, so copy first
        tol = copy.copy(cls.tol)
        tol2 = {'default': (0.15, 0),
                'params': (0.05, 0),
                'pvalues': (0.02, 0.001),
                }
        tol.update(tol2)
        cls.tol = tol


class TestTheilPanel(object):

    @classmethod
    def setup_class(cls):
        # example 3
        nobs = 300
        nobs_i = 5
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups, seed=303305)
        # add random intercept, using the same RandomState
        dgp.group_means = 2 + dgp.random_state.randn(n_groups)
        y = dgp.generate_panel()
        x = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        cls.dgp = dgp
        cls.endog = y
        cls.exog = x
        cls.res_ols = OLS(y, x).fit()

    def test_regression(self):
        y = self.endog
        x = self.exog
        n_groups, k_vars = self.dgp.n_groups, self.dgp.k_vars

        # restrictions penalize differences between group effects,
        # pulling them toward a common intercept
        Rg = (np.eye(n_groups - 1) - 1. / n_groups *
              np.ones((n_groups - 1, n_groups - 1)))
        R = np.c_[np.zeros((n_groups - 1, k_vars)), Rg]
        r = np.zeros(n_groups - 1)
        R[:, k_vars - 1] = -1

        lambd = 1  # 1e-4
        mod = TheilGLS(y, x, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()

        # regression test
        params1 = np.array([
            0.9751655, 1.05215277, 0.37135028, 2.0492626, 2.82062503,
            2.82139775, 1.92940468, 2.96942081, 2.86349583, 3.20695368,
            4.04516422, 3.04918839, 4.54748808, 3.49026961, 3.15529618,
            4.25552932, 2.65471759, 3.62328747, 3.07283053, 3.49485898,
            3.42301424, 2.94677593, 2.81549427, 2.24895113, 2.29222784,
            2.89194946, 3.17052308, 2.37754241, 3.54358533, 3.79838425,
            1.91189071, 1.15976407, 4.05629691, 1.58556827, 4.49941666,
            4.08608599, 3.1889269, 2.86203652, 3.06785013, 1.9376162,
            2.90657681, 3.71910592, 3.15607617, 3.58464547, 2.15466323,
            4.87026717, 2.92909833, 2.64998337, 2.891171, 4.04422964,
            3.54616122, 4.12135273, 3.70232028, 3.8314497, 2.2591451,
            2.39321422, 3.13064532, 2.1569678, 2.04667506, 3.92064689,
            3.66243644, 3.11742725])
        assert_allclose(res.params, params1)

        pen_weight_aicc = mod.select_pen_weight(method='aicc')
        pen_weight_gcv = mod.select_pen_weight(method='gcv')
        pen_weight_cv = mod.select_pen_weight(method='cv')
        pen_weight_bic = mod.select_pen_weight(method='bic')
        assert_allclose(pen_weight_gcv, pen_weight_aicc, rtol=0.1)
        # regression tests:
        assert_allclose(pen_weight_aicc, 4.77333984, rtol=1e-4)
        assert_allclose(pen_weight_gcv, 4.45546875, rtol=1e-4)
        assert_allclose(pen_weight_bic, 9.35957031, rtol=1e-4)
        assert_allclose(pen_weight_cv, 1.99277344, rtol=1e-4)
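
    @staticmethod
    def _gcv_score_sketch(y, hat_matrix):
        # Illustrative helper only; it is not used by the tests and is not
        # the TheilGLS implementation. select_pen_weight(method='gcv') above
        # minimizes a generalized cross-validation criterion; the textbook
        # form of that score, given the hat (smoother) matrix H of a
        # penalized fit, is
        #     GCV(lambd) = n * RSS(lambd) / (n - tr(H(lambd)))**2
        n = len(y)
        resid = y - hat_matrix.dot(y)
        return n * resid.dot(resid) / (n - np.trace(hat_matrix)) ** 2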

    def test_combine_subset_regression(self):
        # split the sample into two, use the first sample as prior for second
        endog = self.endog
        exog = self.exog
        nobs = len(endog)

        n05 = nobs // 2
        np.random.seed(987125)
        # shuffle to get random subsamples
        shuffle_idx = np.random.permutation(np.arange(nobs))
        ys = endog[shuffle_idx]
        xs = exog[shuffle_idx]
        k = 10
        res_ols0 = OLS(ys[:n05], xs[:n05, :k]).fit()
        res_ols1 = OLS(ys[n05:], xs[n05:, :k]).fit()

        w = res_ols1.scale / res_ols0.scale  # approximately 1.01
        mod_1 = TheilGLS(ys[n05:], xs[n05:, :k], r_matrix=np.eye(k),
                         q_matrix=res_ols0.params,
                         sigma_prior=w * res_ols0.cov_params())
        res_1p = mod_1.fit(cov_type='data-prior')
        res_1s = mod_1.fit(cov_type='sandwich')
        res_olsf = OLS(ys, xs[:, :k]).fit()

        assert_allclose(res_1p.params, res_olsf.params, rtol=1e-9)
        corr_fact = np.sqrt(res_1p.scale / res_olsf.scale)
        # correct for differences in scale computation
        assert_allclose(res_1p.bse, res_olsf.bse * corr_fact, rtol=1e-3)

        # regression test; the numbers are not verified externally.
        # TODO: why are these smaller than OLS on the full sample?
        # In a larger sample (nobs=600), they were close to the full OLS bse.
        bse1 = np.array([
            0.26589869, 0.15224812, 0.38407399, 0.75679949, 0.66084200,
            0.54174080, 0.53697607, 0.66006377, 0.38228551, 0.53920485])
        assert_allclose(res_1s.bse, bse1, rtol=1e-7)
369