# -*- coding: utf-8 -*-
"""
Created on Mon May 05 17:29:56 2014

Author: Josef Perktold
"""

import os

import numpy as np
import pandas as pd
import pytest

from numpy.testing import assert_allclose

from statsmodels.regression.linear_model import OLS, GLS

from statsmodels.sandbox.regression.penalized import TheilGLS


class TestTheilTextile(object):

    @classmethod
    def setup_class(cls):

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        filepath = os.path.join(cur_dir, "results",
                                "theil_textile_predict.csv")
        cls.res_predict = pd.read_csv(filepath, sep=",")

        # column names of the data block below (kept for reference)
        names = "year lconsump lincome lprice".split()

        data = np.array('''\
1923 1.99651 1.98543 2.00432
1924 1.99564 1.99167 2.00043
1925 2 2 2
1926 2.04766 2.02078 1.95713
1927 2.08707 2.02078 1.93702
1928 2.07041 2.03941 1.95279
1929 2.08314 2.04454 1.95713
1930 2.13354 2.05038 1.91803
1931 2.18808 2.03862 1.84572
1932 2.18639 2.02243 1.81558
1933 2.20003 2.00732 1.78746
1934 2.14799 1.97955 1.79588
1935 2.13418 1.98408 1.80346
1936 2.22531 1.98945 1.72099
1937 2.18837 2.0103 1.77597
1938 2.17319 2.00689 1.77452
1939 2.2188 2.0162 1.78746'''.split(), float).reshape(-1, 4)

        endog = data[:, 1]
        # constant at the end to match Stata
        exog = np.column_stack((data[:, 2:], np.ones(endog.shape[0])))

        # Stata tgmixed prior:
        # prior(lprice -0.7 0.15 lincome 1 0.15) cov(lprice lincome -0.01)
        r_matrix = np.array([[1, 0, 0], [0, 1, 0]])
        r_mean = [1, -0.7]

        cov_r = np.array([[0.15**2, -0.01], [-0.01, 0.15**2]])
        mod = TheilGLS(endog, exog, r_matrix, q_matrix=r_mean,
                       sigma_prior=cov_r)
        cls.res1 = mod.fit(cov_type='data-prior', use_t=True)
        # cls.res1._cache['scale'] = 0.0001852252884817586  # from tg_mixed
        cls.res1._cache['scale'] = 0.00018334123641580062  # from OLS
        from .results import results_theil_textile as resmodule
        cls.res2 = resmodule.results_theil_textile

    def test_basic(self):
        pt = self.res2.params_table[:, :6].T
        params2, bse2, tvalues2, pvalues2, ci_low, ci_upp = pt
        assert_allclose(self.res1.params, params2, rtol=2e-6)

        # TODO: tgmixed seems to use the scale from the initial OLS,
        # not from the final results: np.sqrt(res.scale / res_ols.scale).
        # See mse_resid below, which is equal to scale.
        # Correction factors tried previously, kept for reference:
        # corr_fact = 0.9836026210570028
        # corr_fact = 0.97376865041463734
        corr_fact = 1
        assert_allclose(self.res1.bse / corr_fact, bse2, rtol=2e-6)
        assert_allclose(self.res1.tvalues * corr_fact, tvalues2, rtol=2e-6)
        # pvalues are very small
        # assert_allclose(self.res1.pvalues, pvalues2, atol=2e-6)
        # assert_allclose(self.res1.pvalues, pvalues2, rtol=0.7)
        ci = self.res1.conf_int()
        # not scale corrected
        assert_allclose(ci[:, 0], ci_low, rtol=0.01)
        assert_allclose(ci[:, 1], ci_upp, rtol=0.01)
        assert_allclose(self.res1.rsquared, self.res2.r2, rtol=2e-6)

        # Note: tgmixed is using k_exog for df_resid
        corr_fact = self.res1.df_resid / self.res2.df_r
        assert_allclose(np.sqrt(self.res1.mse_resid * corr_fact),
                        self.res2.rmse, rtol=2e-6)

        assert_allclose(self.res1.fittedvalues,
                        self.res_predict['fittedvalues'], atol=5e-7)

    def test_other(self):
        tc = self.res1.test_compatibility()
        assert_allclose(np.squeeze(tc[0]), self.res2.compat, rtol=2e-6)
        assert_allclose(np.squeeze(tc[1]), self.res2.pvalue, rtol=2e-6)

        frac = self.res1.share_data()
        # TODO: check again; I guess tgmixed uses the final scale in the
        # hat matrix, but I'm not sure. This passed in a previous version,
        # but now we override scale with the OLS scale.
        # assert_allclose(frac, self.res2.frac_sample, rtol=2e-6)
        # regression test:
        assert_allclose(frac, 0.6946116246864239, rtol=2e-6)

    def test_no_penalization(self):
        res_ols = OLS(self.res1.model.endog, self.res1.model.exog).fit()
        res_theil = self.res1.model.fit(pen_weight=0, cov_type='data-prior')
        assert_allclose(res_theil.params, res_ols.params, rtol=1e-10)
        assert_allclose(res_theil.bse, res_ols.bse, rtol=1e-10)

    @pytest.mark.smoke
    def test_summary(self):
        with pytest.warns(UserWarning):
            self.res1.summary()
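

# The textile test above exercises TheilGLS against Stata's tgmixed. As a
# rough cross-check (an illustrative sketch, not used by the tests and not
# the library's API), the Theil-Goldberger mixed estimator for a stochastic
# prior R b ~ N(q, V) can be written directly as
#
#     b = (X'X / s2 + R' V^{-1} R)^{-1} (X'y / s2 + R' V^{-1} q)
#
# where, per the TODO notes in test_basic, s2 is taken here from the
# initial OLS fit, which is what tgmixed appears to use.
def _mixed_estimator_sketch(endog, exog, r_matrix, q_mean, v_prior):
    s2 = OLS(endog, exog).fit().scale
    r = np.atleast_2d(np.asarray(r_matrix, dtype=float))
    v_inv = np.linalg.pinv(np.atleast_2d(np.asarray(v_prior, dtype=float)))
    q = np.atleast_1d(np.asarray(q_mean, dtype=float))
    xtx = exog.T @ exog / s2 + r.T @ v_inv @ r
    xty = exog.T @ endog / s2 + r.T @ v_inv @ q
    return np.linalg.solve(xtx, xty)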


class CheckEquivalenceMixin(object):

    tol = {'default': (1e-4, 1e-20)}

    @classmethod
    def get_sample(cls):
        np.random.seed(987456)
        nobs, k_vars = 200, 5
        beta = 0.5 * np.array([0.1, 1, 1, 0, 0])
        x = np.random.randn(nobs, k_vars)
        x[:, 0] = 1
        y = np.dot(x, beta) + 2 * np.random.randn(nobs)
        return y, x

    def test_attributes(self):

        attributes_fit = ['params', 'rsquared', 'df_resid', 'df_model',
                          'llf', 'aic', 'bic'
                          # 'fittedvalues', 'resid'
                          ]
        attributes_inference = ['bse', 'tvalues', 'pvalues']
        import copy
        attributes = copy.copy(attributes_fit)

        if not getattr(self, 'skip_inference', False):
            attributes.extend(attributes_inference)

        for att in attributes:
            r1 = getattr(self.res1, att)
            r2 = getattr(self.res2, att)
            if not np.size(r1) == 1:
                r1 = r1[:len(r2)]

            # check if we have overridden the tolerance for this attribute
            rtol, atol = self.tol.get(att, self.tol['default'])
            message = 'attribute: ' + att  # + '\n%r\n%r' % (r1, r2)
            assert_allclose(r1, r2, rtol=rtol, atol=atol, err_msg=message)

        # models are not close enough for some attributes at high precision
        assert_allclose(self.res1.fittedvalues, self.res2.fittedvalues,
                        rtol=1e-3, atol=1e-4)
        assert_allclose(self.res1.resid, self.res2.resid,
                        rtol=1e-3, atol=1e-4)
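

# Note on the 1-D `sigma_prior` arguments used below: the vector appears to
# be interpreted as the diagonal of the prior variance, with the penalty
# built from its Moore-Penrose pseudo-inverse (an implementation detail,
# stated here as an assumption). Zero prior-variance entries are then mapped
# to zero instead of infinity, so they drop out of the penalty: with
# [0, 0, 1., 1.] only the last two constrained coefficients are shrunk
# toward zero. A standalone illustration of the pinv behavior (not used by
# the tests):
def _pinv_zero_variance_sketch():
    v = np.diag([0., 0., 1., 1.])
    # pinv inverts the nonzero diagonal entries and keeps the zeros at zero
    assert_allclose(np.linalg.pinv(v), v)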


class TestTheil1(CheckEquivalenceMixin):
    # penalize last two parameters to zero

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x[:, :3]).fit()


class TestTheil2(CheckEquivalenceMixin):
    # no penalization = same as OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        mod1 = TheilGLS(y, x, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(0)
        cls.res2 = OLS(y, x).fit()


class TestTheil3(CheckEquivalenceMixin):
    # perfect multicollinearity = same as OLS in terms of fit;
    # inference (bse, ...) is different

    @classmethod
    def setup_class(cls):
        cls.skip_inference = True
        y, x = cls.get_sample()
        # duplicate all regressors to create perfect multicollinearity
        xd = np.column_stack((x, x))
        # weakly penalize the coefficients of the duplicated columns toward
        # zero to resolve the singularity
        r_matrix = np.eye(5, 10, 5)
        mod1 = TheilGLS(y, xd, r_matrix=r_matrix)
        cls.res1 = mod1.fit(0.001, cov_type='data-prior')
        cls.res2 = OLS(y, x).fit()


class TestTheilGLS(CheckEquivalenceMixin):
    # penalize last two parameters to zero, with GLS weights

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        nobs = len(y)
        weights = (np.arange(nobs) < (nobs // 2)) + 0.5
        mod1 = TheilGLS(y, x, sigma=weights, sigma_prior=[0, 0, 1., 1.])
        cls.res1 = mod1.fit(200000)
        cls.res2 = GLS(y, x[:, :3], sigma=weights).fit()


class TestTheilLinRestriction(CheckEquivalenceMixin):
    # impose a linear restriction with small uncertainty - close to OLS

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2: if b1 == b2 == b, then
        # x1 * b1 + x2 * b2 == (x1 + x2) * b
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(200000)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision; careful: cls.tol is mutable
        tol = {'pvalues': (1e-4, 2e-7),
               'tvalues': (5e-4, 0)}
        tol.update(cls.tol)
        cls.tol = tol


class TestTheilLinRestrictionApprox(CheckEquivalenceMixin):
    # impose a linear restriction with some uncertainty

    @classmethod
    def setup_class(cls):
        y, x = cls.get_sample()
        # merge var1 and var2 (see TestTheilLinRestriction)
        x2 = x[:, :2].copy()
        x2[:, 1] += x[:, 2]
        # mod1 = TheilGLS(y, x, r_matrix=[[0, 1, -1, 0, 0]])
        mod1 = TheilGLS(y, x[:, :3], r_matrix=[[0, 1, -1]])
        cls.res1 = mod1.fit(100)
        cls.res2 = OLS(y, x2).fit()

        # adjust precision; careful: cls.tol is mutable
        import copy
        tol = copy.copy(cls.tol)
        tol2 = {'default': (0.15, 0),
                'params': (0.05, 0),
                'pvalues': (0.02, 0.001),
                }
        tol.update(tol2)
        cls.tol = tol
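

# The two restriction tests above can also be read through the
# data-augmentation view of the Theil estimator: with an identity prior
# variance, minimizing ||y - X b||^2 + lambd * ||R b - q||^2 is plain OLS
# on the system with sqrt(lambd) * R stacked under X and sqrt(lambd) * q
# stacked under y. A minimal sketch of that equivalence (illustrative only;
# TheilGLS's pen_weight need not map one-to-one onto lambd here, since the
# library can also scale the penalty by the error variance):
def _augmented_ols_sketch(y, x, r_matrix, q, lambd):
    r = np.atleast_2d(np.asarray(r_matrix, dtype=float))
    q = np.atleast_1d(np.asarray(q, dtype=float))
    w = np.sqrt(lambd)
    x_aug = np.vstack([np.asarray(x, dtype=float), w * r])
    y_aug = np.concatenate([np.asarray(y, dtype=float), w * q])
    return OLS(y_aug, x_aug).fit().params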


class TestTheilPanel(object):

    @classmethod
    def setup_class(cls):
        # example 3
        nobs = 300
        nobs_i = 5
        n_groups = nobs // nobs_i
        k_vars = 3

        from statsmodels.sandbox.panel.random_panel import PanelSample
        dgp = PanelSample(nobs, k_vars, n_groups, seed=303305)
        # add a random intercept, using the same RandomState
        dgp.group_means = 2 + dgp.random_state.randn(n_groups)
        print('seed', dgp.seed)
        y = dgp.generate_panel()
        # regressors: the non-constant exog columns plus group dummies
        x = np.column_stack((dgp.exog[:, 1:],
                             dgp.groups[:, None] == np.arange(n_groups)))
        cls.dgp = dgp
        cls.endog = y
        cls.exog = x
        cls.res_ols = OLS(y, x).fit()

    def test_regression(self):
        y = self.endog
        x = self.exog
        n_groups, k_vars = self.dgp.n_groups, self.dgp.k_vars

        # restrictions that shrink the group effects toward a common value:
        # Rg roughly centers the effects of the later groups, and the -1
        # column subtracts the effect of the first group
        Rg = (np.eye(n_groups - 1) - 1. / n_groups *
              np.ones((n_groups - 1, n_groups - 1)))
        R = np.c_[np.zeros((n_groups - 1, k_vars)), Rg]
        r = np.zeros(n_groups - 1)
        R[:, k_vars - 1] = -1

        lambd = 1  # 1e-4
        mod = TheilGLS(y, x, r_matrix=R, q_matrix=r, sigma_prior=lambd)
        res = mod.fit()

        # regression test
        params1 = np.array([
            0.9751655, 1.05215277, 0.37135028, 2.0492626, 2.82062503,
            2.82139775, 1.92940468, 2.96942081, 2.86349583, 3.20695368,
            4.04516422, 3.04918839, 4.54748808, 3.49026961, 3.15529618,
            4.25552932, 2.65471759, 3.62328747, 3.07283053, 3.49485898,
            3.42301424, 2.94677593, 2.81549427, 2.24895113, 2.29222784,
            2.89194946, 3.17052308, 2.37754241, 3.54358533, 3.79838425,
            1.91189071, 1.15976407, 4.05629691, 1.58556827, 4.49941666,
            4.08608599, 3.1889269, 2.86203652, 3.06785013, 1.9376162,
            2.90657681, 3.71910592, 3.15607617, 3.58464547, 2.15466323,
            4.87026717, 2.92909833, 2.64998337, 2.891171, 4.04422964,
            3.54616122, 4.12135273, 3.70232028, 3.8314497, 2.2591451,
            2.39321422, 3.13064532, 2.1569678, 2.04667506, 3.92064689,
            3.66243644, 3.11742725])
        assert_allclose(res.params, params1)

        pen_weight_aicc = mod.select_pen_weight(method='aicc')
        pen_weight_gcv = mod.select_pen_weight(method='gcv')
        pen_weight_cv = mod.select_pen_weight(method='cv')
        pen_weight_bic = mod.select_pen_weight(method='bic')
        assert_allclose(pen_weight_gcv, pen_weight_aicc, rtol=0.1)
        # regression tests:
        assert_allclose(pen_weight_aicc, 4.77333984, rtol=1e-4)
        assert_allclose(pen_weight_gcv, 4.45546875, rtol=1e-4)
        assert_allclose(pen_weight_bic, 9.35957031, rtol=1e-4)
        assert_allclose(pen_weight_cv, 1.99277344, rtol=1e-4)

    def test_combine_subset_regression(self):
        # split the sample into two and use the first subsample as the
        # prior for the second
        endog = self.endog
        exog = self.exog
        nobs = len(endog)

        n05 = nobs // 2
        np.random.seed(987125)
        # shuffle to get random subsamples
        shuffle_idx = np.random.permutation(np.arange(nobs))
        ys = endog[shuffle_idx]
        xs = exog[shuffle_idx]
        k = 10
        res_ols0 = OLS(ys[:n05], xs[:n05, :k]).fit()
        res_ols1 = OLS(ys[n05:], xs[n05:, :k]).fit()

        w = res_ols1.scale / res_ols0.scale  # approximately 1.01
        mod_1 = TheilGLS(ys[n05:], xs[n05:, :k], r_matrix=np.eye(k),
                         q_matrix=res_ols0.params,
                         sigma_prior=w * res_ols0.cov_params())
        res_1p = mod_1.fit(cov_type='data-prior')
        res_1s = mod_1.fit(cov_type='sandwich')
        res_olsf = OLS(ys, xs[:, :k]).fit()

        assert_allclose(res_1p.params, res_olsf.params, rtol=1e-9)
        corr_fact = np.sqrt(res_1p.scale / res_olsf.scale)
        # correct for differences in the scale computation
        assert_allclose(res_1p.bse, res_olsf.bse * corr_fact, rtol=1e-3)

        # regression test, does not verify the numbers; it is unclear why
        # these are smaller than OLS on the full sample - in a larger
        # sample (nobs=600) they were close to full-sample OLS
        bse1 = np.array([
            0.26589869, 0.15224812, 0.38407399, 0.75679949, 0.66084200,
            0.54174080, 0.53697607, 0.66006377, 0.38228551, 0.53920485])
        assert_allclose(res_1s.bse, bse1, rtol=1e-7)
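

# The combination recipe exercised in test_combine_subset_regression,
# written out as a standalone helper for reference (an illustrative sketch,
# not part of the test suite): fit OLS on one subsample and feed its
# estimates and covariance in as the stochastic prior for the other.
def _combine_via_prior_sketch(y0, x0, y1, x1):
    res0 = OLS(y0, x0).fit()
    res1 = OLS(y1, x1).fit()
    # rescale the prior covariance for the difference in residual variances,
    # mirroring the weight `w` used in the test above
    w = res1.scale / res0.scale
    k = x1.shape[1]
    mod = TheilGLS(y1, x1, r_matrix=np.eye(k), q_matrix=res0.params,
                   sigma_prior=w * res0.cov_params())
    return mod.fit(cov_type='data-prior')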