1""" 2Test functions for models.regression 3""" 4# TODO: Test for LM 5from statsmodels.compat.python import lrange 6 7import warnings 8 9import numpy as np 10from numpy.testing import ( 11 assert_, 12 assert_allclose, 13 assert_almost_equal, 14 assert_equal, 15 assert_raises, 16) 17import pandas as pd 18import pytest 19from scipy.linalg import toeplitz 20from scipy.stats import t as student_t 21 22from statsmodels.datasets import longley 23from statsmodels.regression.linear_model import ( 24 GLS, 25 OLS, 26 WLS, 27 burg, 28 yule_walker, 29) 30from statsmodels.tools.tools import add_constant 31 32DECIMAL_4 = 4 33DECIMAL_3 = 3 34DECIMAL_2 = 2 35DECIMAL_1 = 1 36DECIMAL_7 = 7 37DECIMAL_0 = 0 38 39try: 40 import cvxopt # noqa:F401 41 42 has_cvxopt = True 43except ImportError: 44 has_cvxopt = False 45 46 47class CheckRegressionResults(object): 48 """ 49 res2 contains results from Rmodelwrap or were obtained from a statistical 50 packages such as R, Stata, or SAS and were written to model_results 51 """ 52 53 decimal_params = DECIMAL_4 54 55 def test_params(self): 56 assert_almost_equal( 57 self.res1.params, self.res2.params, self.decimal_params 58 ) 59 60 decimal_standarderrors = DECIMAL_4 61 62 def test_standarderrors(self): 63 assert_allclose( 64 self.res1.bse, self.res2.bse, self.decimal_standarderrors 65 ) 66 67 decimal_confidenceintervals = DECIMAL_4 68 69 def test_confidenceintervals(self): 70 # NOTE: stata rounds residuals (at least) to sig digits so approx_equal 71 conf1 = self.res1.conf_int() 72 conf2 = self.res2.conf_int() 73 for i in range(len(conf1)): 74 assert_allclose( 75 conf1[i][0], 76 conf2[i][0], 77 rtol=10 ** -self.decimal_confidenceintervals, 78 ) 79 assert_allclose( 80 conf1[i][1], 81 conf2[i][1], 82 rtol=10 ** -self.decimal_confidenceintervals, 83 ) 84 85 decimal_conf_int_subset = DECIMAL_4 86 87 def test_conf_int_subset(self): 88 if len(self.res1.params) > 1: 89 ci1 = self.res1.conf_int(cols=(1, 2)) 90 ci2 = self.res1.conf_int()[1:3] 91 assert_almost_equal(ci1, ci2, self.decimal_conf_int_subset) 92 else: 93 pass 94 95 decimal_scale = DECIMAL_4 96 97 def test_scale(self): 98 assert_almost_equal( 99 self.res1.scale, self.res2.scale, self.decimal_scale 100 ) 101 102 decimal_rsquared = DECIMAL_4 103 104 def test_rsquared(self): 105 assert_almost_equal( 106 self.res1.rsquared, self.res2.rsquared, self.decimal_rsquared 107 ) 108 109 decimal_rsquared_adj = DECIMAL_4 110 111 def test_rsquared_adj(self): 112 assert_almost_equal( 113 self.res1.rsquared_adj, 114 self.res2.rsquared_adj, 115 self.decimal_rsquared_adj, 116 ) 117 118 def test_degrees(self): 119 assert_equal(self.res1.model.df_model, self.res2.df_model) 120 assert_equal(self.res1.model.df_resid, self.res2.df_resid) 121 122 decimal_ess = DECIMAL_4 123 124 def test_ess(self): 125 # Explained Sum of Squares 126 assert_almost_equal(self.res1.ess, self.res2.ess, self.decimal_ess) 127 128 decimal_ssr = DECIMAL_4 129 130 def test_sumof_squaredresids(self): 131 assert_almost_equal(self.res1.ssr, self.res2.ssr, self.decimal_ssr) 132 133 decimal_mse_resid = DECIMAL_4 134 135 def test_mse_resid(self): 136 # Mean squared error of residuals 137 assert_almost_equal( 138 self.res1.mse_model, self.res2.mse_model, self.decimal_mse_resid 139 ) 140 141 decimal_mse_model = DECIMAL_4 142 143 def test_mse_model(self): 144 assert_almost_equal( 145 self.res1.mse_resid, self.res2.mse_resid, self.decimal_mse_model 146 ) 147 148 decimal_mse_total = DECIMAL_4 149 150 def test_mse_total(self): 151 assert_almost_equal( 152 self.res1.mse_total, 153 self.res2.mse_total, 154 self.decimal_mse_total, 155 err_msg="Test class %s" % self, 156 ) 157 158 decimal_fvalue = DECIMAL_4 159 160 def test_fvalue(self): 161 # did not change this, not sure it should complain -inf not equal -inf 162 # if not (np.isinf(self.res1.fvalue) and np.isinf(self.res2.fvalue)): 163 assert_almost_equal( 164 self.res1.fvalue, self.res2.fvalue, self.decimal_fvalue 165 ) 166 167 decimal_loglike = DECIMAL_4 168 169 def test_loglike(self): 170 assert_almost_equal(self.res1.llf, self.res2.llf, self.decimal_loglike) 171 172 decimal_aic = DECIMAL_4 173 174 def test_aic(self): 175 assert_almost_equal(self.res1.aic, self.res2.aic, self.decimal_aic) 176 # the following just checks the definition 177 aicc1 = self.res1.info_criteria("aicc") 178 k = self.res1.df_model + self.res1.model.k_constant 179 nobs = self.res1.model.nobs 180 aicc2 = self.res1.aic + 2 * (k**2 + k) / (nobs - k - 1) 181 assert_allclose(aicc1, aicc2, rtol=1e-10) 182 hqic1 = self.res1.info_criteria("hqic") 183 hqic2 = (self.res1.aic - 2 * k) + 2 * np.log(np.log(nobs)) * k 184 assert_allclose(hqic1, hqic2, rtol=1e-10) 185 186 decimal_bic = DECIMAL_4 187 188 def test_bic(self): 189 assert_almost_equal(self.res1.bic, self.res2.bic, self.decimal_bic) 190 191 decimal_pvalues = DECIMAL_4 192 193 def test_pvalues(self): 194 assert_almost_equal( 195 self.res1.pvalues, self.res2.pvalues, self.decimal_pvalues 196 ) 197 198 decimal_wresid = DECIMAL_4 199 200 def test_wresid(self): 201 assert_almost_equal( 202 self.res1.wresid, self.res2.wresid, self.decimal_wresid 203 ) 204 205 decimal_resids = DECIMAL_4 206 207 def test_resids(self): 208 assert_almost_equal( 209 self.res1.resid, self.res2.resid, self.decimal_resids 210 ) 211 212 decimal_norm_resids = DECIMAL_4 213 214 def test_norm_resids(self): 215 assert_almost_equal( 216 self.res1.resid_pearson, 217 self.res2.resid_pearson, 218 self.decimal_norm_resids, 219 ) 220 221 222# TODO: test fittedvalues and what else? 223 224 225class TestOLS(CheckRegressionResults): 226 @classmethod 227 def setup_class(cls): 228 from .results.results_regression import Longley 229 230 data = longley.load() 231 endog = np.asarray(data.endog) 232 exog = np.asarray(data.exog) 233 exog = add_constant(exog, prepend=False) 234 res1 = OLS(endog, exog).fit() 235 res2 = Longley() 236 res2.wresid = res1.wresid # workaround hack 237 cls.res1 = res1 238 cls.res2 = res2 239 240 res_qr = OLS(endog, exog).fit(method="qr") 241 242 model_qr = OLS(endog, exog) 243 Q, R = np.linalg.qr(exog) 244 model_qr.exog_Q, model_qr.exog_R = Q, R 245 model_qr.normalized_cov_params = np.linalg.inv(np.dot(R.T, R)) 246 model_qr.rank = np.linalg.matrix_rank(R) 247 res_qr2 = model_qr.fit(method="qr") 248 249 cls.res_qr = res_qr 250 cls.res_qr_manual = res_qr2 251 252 def test_eigenvalues(self): 253 eigenval_perc_diff = ( 254 self.res_qr.eigenvals - self.res_qr_manual.eigenvals 255 ) 256 eigenval_perc_diff /= self.res_qr.eigenvals 257 zeros = np.zeros_like(eigenval_perc_diff) 258 assert_almost_equal(eigenval_perc_diff, zeros, DECIMAL_7) 259 260 # Robust error tests. Compare values computed with SAS 261 def test_HC0_errors(self): 262 # They are split up because the copied results do not have any 263 # DECIMAL_4 places for the last place. 264 assert_almost_equal( 265 self.res1.HC0_se[:-1], self.res2.HC0_se[:-1], DECIMAL_4 266 ) 267 assert_allclose(np.round(self.res1.HC0_se[-1]), self.res2.HC0_se[-1]) 268 269 def test_HC1_errors(self): 270 assert_almost_equal( 271 self.res1.HC1_se[:-1], self.res2.HC1_se[:-1], DECIMAL_4 272 ) 273 # Note: tolerance is tight; rtol=3e-7 fails while 4e-7 passes 274 assert_allclose(self.res1.HC1_se[-1], self.res2.HC1_se[-1], rtol=4e-7) 275 276 def test_HC2_errors(self): 277 assert_almost_equal( 278 self.res1.HC2_se[:-1], self.res2.HC2_se[:-1], DECIMAL_4 279 ) 280 # Note: tolerance is tight; rtol=4e-7 fails while 5e-7 passes 281 assert_allclose(self.res1.HC2_se[-1], self.res2.HC2_se[-1], rtol=5e-7) 282 283 def test_HC3_errors(self): 284 assert_almost_equal( 285 self.res1.HC3_se[:-1], self.res2.HC3_se[:-1], DECIMAL_4 286 ) 287 # Note: tolerance is tight; rtol=1e-7 fails while 1.5e-7 passes 288 assert_allclose( 289 self.res1.HC3_se[-1], self.res2.HC3_se[-1], rtol=1.5e-7 290 ) 291 292 def test_qr_params(self): 293 assert_almost_equal(self.res1.params, self.res_qr.params, 6) 294 295 def test_qr_normalized_cov_params(self): 296 # todo: need assert_close 297 assert_almost_equal( 298 np.ones_like(self.res1.normalized_cov_params), 299 self.res1.normalized_cov_params 300 / self.res_qr.normalized_cov_params, 301 5, 302 ) 303 304 def test_missing(self): 305 data = longley.load() 306 data.exog = add_constant(data.exog, prepend=False) 307 data.endog[[3, 7, 14]] = np.nan 308 mod = OLS(data.endog, data.exog, missing="drop") 309 assert_equal(mod.endog.shape[0], 13) 310 assert_equal(mod.exog.shape[0], 13) 311 312 def test_rsquared_adj_overfit(self): 313 # Test that if df_resid = 0, rsquared_adj = 0. 314 # This is a regression test for user issue: 315 # https://github.com/statsmodels/statsmodels/issues/868 316 with warnings.catch_warnings(record=True): 317 x = np.random.randn(5) 318 y = np.random.randn(5, 6) 319 results = OLS(x, y).fit() 320 rsquared_adj = results.rsquared_adj 321 assert_equal(rsquared_adj, np.nan) 322 323 def test_qr_alternatives(self): 324 assert_allclose( 325 self.res_qr.params, self.res_qr_manual.params, rtol=5e-12 326 ) 327 328 def test_norm_resid(self): 329 resid = self.res1.wresid 330 norm_resid = resid / np.sqrt(np.sum(resid ** 2.0) / self.res1.df_resid) 331 model_norm_resid = self.res1.resid_pearson 332 assert_almost_equal(model_norm_resid, norm_resid, DECIMAL_7) 333 334 def test_summary_slim(self): 335 # check that slim summary is smaller, does not verify content 336 summ = self.res1.summary(slim=True) 337 assert len(summ.tables) == 2 338 assert len(str(summ)) < 6700 339 340 def test_norm_resid_zero_variance(self): 341 with warnings.catch_warnings(record=True): 342 y = self.res1.model.endog 343 res = OLS(y, y).fit() 344 assert_allclose(res.scale, 0, atol=1e-20) 345 assert_allclose(res.wresid, res.resid_pearson, atol=5e-11) 346 347 348class TestRTO(CheckRegressionResults): 349 @classmethod 350 def setup_class(cls): 351 from .results.results_regression import LongleyRTO 352 353 data = longley.load() 354 endog = np.asarray(data.endog) 355 exog = np.asarray(data.exog) 356 res1 = OLS(endog, exog).fit() 357 res2 = LongleyRTO() 358 res2.wresid = res1.wresid # workaround hack 359 cls.res1 = res1 360 cls.res2 = res2 361 362 res_qr = OLS(endog, exog).fit(method="qr") 363 cls.res_qr = res_qr 364 365 366class TestFtest(object): 367 """ 368 Tests f_test vs. RegressionResults 369 """ 370 371 @classmethod 372 def setup_class(cls): 373 data = longley.load() 374 data.exog = add_constant(data.exog, prepend=False) 375 cls.res1 = OLS(data.endog, data.exog).fit() 376 R = np.identity(7)[:-1, :] 377 cls.Ftest = cls.res1.f_test(R) 378 379 def test_F(self): 380 assert_almost_equal(self.Ftest.fvalue, self.res1.fvalue, DECIMAL_4) 381 382 def test_p(self): 383 assert_almost_equal(self.Ftest.pvalue, self.res1.f_pvalue, DECIMAL_4) 384 385 def test_Df_denom(self): 386 assert_equal(self.Ftest.df_denom, self.res1.model.df_resid) 387 388 def test_Df_num(self): 389 assert_equal(self.Ftest.df_num, 6) 390 391 392class TestFTest2(object): 393 """ 394 A joint test that the coefficient on 395 GNP = the coefficient on UNEMP and that the coefficient on 396 POP = the coefficient on YEAR for the Longley dataset. 397 398 Ftest1 is from statsmodels. Results are from Rpy using R's car library. 399 """ 400 401 @classmethod 402 def setup_class(cls): 403 data = longley.load() 404 columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)] 405 data.exog.columns = columns 406 data.exog = add_constant(data.exog, prepend=False) 407 res1 = OLS(data.endog, data.exog).fit() 408 R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]] 409 cls.Ftest1 = res1.f_test(R2) 410 hyp = "x2 = x3, x5 = x6" 411 cls.NewFtest1 = res1.f_test(hyp) 412 413 def test_new_ftest(self): 414 assert_equal(self.NewFtest1.fvalue, self.Ftest1.fvalue) 415 416 def test_fvalue(self): 417 assert_almost_equal(self.Ftest1.fvalue, 9.7404618732968196, DECIMAL_4) 418 419 def test_pvalue(self): 420 assert_almost_equal( 421 self.Ftest1.pvalue, 0.0056052885317493459, DECIMAL_4 422 ) 423 424 def test_df_denom(self): 425 assert_equal(self.Ftest1.df_denom, 9) 426 427 def test_df_num(self): 428 assert_equal(self.Ftest1.df_num, 2) 429 430 431class TestFtestQ(object): 432 """ 433 A joint hypothesis test that Rb = q. Coefficient tests are essentially 434 made up. Test values taken from Stata. 435 """ 436 437 @classmethod 438 def setup_class(cls): 439 data = longley.load() 440 data.exog = add_constant(data.exog, prepend=False) 441 res1 = OLS(data.endog, data.exog).fit() 442 R = np.array( 443 [ 444 [0, 1, 1, 0, 0, 0, 0], 445 [0, 1, 0, 1, 0, 0, 0], 446 [0, 1, 0, 0, 0, 0, 0], 447 [0, 0, 0, 0, 1, 0, 0], 448 [0, 0, 0, 0, 0, 1, 0], 449 ] 450 ) 451 q = np.array([0, 0, 0, 1, 0]) 452 cls.Ftest1 = res1.f_test((R, q)) 453 454 def test_fvalue(self): 455 assert_almost_equal(self.Ftest1.fvalue, 70.115557, 5) 456 457 def test_pvalue(self): 458 assert_almost_equal(self.Ftest1.pvalue, 6.229e-07, 10) 459 460 def test_df_denom(self): 461 assert_equal(self.Ftest1.df_denom, 9) 462 463 def test_df_num(self): 464 assert_equal(self.Ftest1.df_num, 5) 465 466 467class TestTtest(object): 468 """ 469 Test individual t-tests. Ie., are the coefficients significantly 470 different than zero. 471 """ 472 473 @classmethod 474 def setup_class(cls): 475 data = longley.load() 476 columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)] 477 data.exog.columns = columns 478 data.exog = add_constant(data.exog, prepend=False) 479 cls.res1 = OLS(data.endog, data.exog).fit() 480 R = np.identity(7) 481 cls.Ttest = cls.res1.t_test(R) 482 hyp = "x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0" 483 cls.NewTTest = cls.res1.t_test(hyp) 484 485 def test_new_tvalue(self): 486 assert_equal(self.NewTTest.tvalue, self.Ttest.tvalue) 487 488 def test_tvalue(self): 489 assert_almost_equal(self.Ttest.tvalue, self.res1.tvalues, DECIMAL_4) 490 491 def test_sd(self): 492 assert_almost_equal(self.Ttest.sd, self.res1.bse, DECIMAL_4) 493 494 def test_pvalue(self): 495 assert_almost_equal( 496 self.Ttest.pvalue, 497 student_t.sf(np.abs(self.res1.tvalues), self.res1.model.df_resid) 498 * 2, 499 DECIMAL_4, 500 ) 501 502 def test_df_denom(self): 503 assert_equal(self.Ttest.df_denom, self.res1.model.df_resid) 504 505 def test_effect(self): 506 assert_almost_equal(self.Ttest.effect, self.res1.params) 507 508 509class TestTtest2(object): 510 """ 511 Tests the hypothesis that the coefficients on POP and YEAR 512 are equal. 513 514 Results from RPy using 'car' package. 515 """ 516 517 @classmethod 518 def setup_class(cls): 519 R = np.zeros(7) 520 R[4:6] = [1, -1] 521 data = longley.load() 522 data.exog = add_constant(data.exog, prepend=False) 523 res1 = OLS(data.endog, data.exog).fit() 524 cls.Ttest1 = res1.t_test(R) 525 526 def test_tvalue(self): 527 assert_almost_equal(self.Ttest1.tvalue, -4.0167754636397284, DECIMAL_4) 528 529 def test_sd(self): 530 assert_almost_equal(self.Ttest1.sd, 455.39079425195314, DECIMAL_4) 531 532 def test_pvalue(self): 533 assert_almost_equal( 534 self.Ttest1.pvalue, 2 * 0.0015163772380932246, DECIMAL_4 535 ) 536 537 def test_df_denom(self): 538 assert_equal(self.Ttest1.df_denom, 9) 539 540 def test_effect(self): 541 assert_almost_equal(self.Ttest1.effect, -1829.2025687186533, DECIMAL_4) 542 543 544class TestGLS(object): 545 """ 546 These test results were obtained by replication with R. 547 """ 548 549 @classmethod 550 def setup_class(cls): 551 from .results.results_regression import LongleyGls 552 553 data = longley.load() 554 exog = add_constant( 555 np.column_stack((data.exog.iloc[:, 1], data.exog.iloc[:, 4])), 556 prepend=False, 557 ) 558 tmp_results = OLS(data.endog, exog).fit() 559 rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][ 560 1 561 ] # by assumption 562 order = toeplitz(np.arange(16)) 563 sigma = rho ** order 564 GLS_results = GLS(data.endog, exog, sigma=sigma).fit() 565 cls.res1 = GLS_results 566 cls.res2 = LongleyGls() 567 # attach for test_missing 568 cls.sigma = sigma 569 cls.exog = exog 570 cls.endog = data.endog 571 572 def test_aic(self): 573 # Note: tolerance is tight; rtol=3e-3 fails while 4e-3 passes 574 assert_allclose(self.res1.aic + 2, self.res2.aic, rtol=4e-3) 575 576 def test_bic(self): 577 # Note: tolerance is tight; rtol=1e-2 fails while 1.5e-2 passes 578 assert_allclose(self.res1.bic, self.res2.bic, rtol=1.5e-2) 579 580 def test_loglike(self): 581 assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_0) 582 583 def test_params(self): 584 assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_1) 585 586 def test_resid(self): 587 assert_almost_equal(self.res1.resid, self.res2.resid, DECIMAL_4) 588 589 def test_scale(self): 590 assert_almost_equal(self.res1.scale, self.res2.scale, DECIMAL_4) 591 592 def test_tvalues(self): 593 assert_almost_equal(self.res1.tvalues, self.res2.tvalues, DECIMAL_4) 594 595 def test_standarderrors(self): 596 assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4) 597 598 def test_fittedvalues(self): 599 assert_almost_equal( 600 self.res1.fittedvalues, self.res2.fittedvalues, DECIMAL_4 601 ) 602 603 def test_pvalues(self): 604 assert_almost_equal(self.res1.pvalues, self.res2.pvalues, DECIMAL_4) 605 606 def test_missing(self): 607 endog = self.endog.copy() # copy or changes endog for other methods 608 endog[[4, 7, 14]] = np.nan 609 mod = GLS(endog, self.exog, sigma=self.sigma, missing="drop") 610 assert_equal(mod.endog.shape[0], 13) 611 assert_equal(mod.exog.shape[0], 13) 612 assert_equal(mod.sigma.shape, (13, 13)) 613 614 615class TestGLS_alt_sigma(CheckRegressionResults): 616 """ 617 Test that GLS with no argument is equivalent to OLS. 618 """ 619 620 @classmethod 621 def setup_class(cls): 622 data = longley.load() 623 endog = np.asarray(data.endog) 624 exog = np.asarray(data.exog) 625 exog = add_constant(exog, prepend=False) 626 ols_res = OLS(endog, exog).fit() 627 gls_res = GLS(endog, exog).fit() 628 gls_res_scalar = GLS(endog, exog, sigma=1) 629 cls.endog = endog 630 cls.exog = exog 631 cls.res1 = gls_res 632 cls.res2 = ols_res 633 cls.res3 = gls_res_scalar 634 635 # self.res2.conf_int = self.res2.conf_int() 636 637 def test_wrong_size_sigma_1d(self): 638 n = len(self.endog) 639 assert_raises( 640 ValueError, GLS, self.endog, self.exog, sigma=np.ones(n - 1) 641 ) 642 643 def test_wrong_size_sigma_2d(self): 644 n = len(self.endog) 645 assert_raises( 646 ValueError, 647 GLS, 648 self.endog, 649 self.exog, 650 sigma=np.ones((n - 1, n - 1)), 651 ) 652 653 654# FIXME: do not leave commented-out, use or move/remove 655# def check_confidenceintervals(self, conf1, conf2): 656# assert_almost_equal(conf1, conf2, DECIMAL_4) 657 658 659class TestLM(object): 660 @classmethod 661 def setup_class(cls): 662 # TODO: Test HAC method 663 rs = np.random.RandomState(1234) 664 x = rs.randn(100, 3) 665 b = np.ones((3, 1)) 666 e = rs.randn(100, 1) 667 y = np.dot(x, b) + e 668 # Cases? 669 # Homoskedastic 670 # HC0 671 cls.res1_full = OLS(y, x).fit() 672 cls.res1_restricted = OLS(y, x[:, 0]).fit() 673 674 cls.res2_full = cls.res1_full.get_robustcov_results("HC0") 675 cls.res2_restricted = cls.res1_restricted.get_robustcov_results("HC0") 676 677 cls.x = x 678 cls.Y = y 679 680 def test_LM_homoskedastic(self): 681 resid = self.res1_restricted.wresid 682 n = resid.shape[0] 683 x = self.x 684 S = np.dot(resid, resid) / n * np.dot(x.T, x) / n 685 Sinv = np.linalg.inv(S) 686 s = np.mean(x * resid[:, None], 0) 687 LMstat = n * np.dot(np.dot(s, Sinv), s.T) 688 LMstat_OLS = self.res1_full.compare_lm_test(self.res1_restricted) 689 LMstat2 = LMstat_OLS[0] 690 assert_almost_equal(LMstat, LMstat2, DECIMAL_7) 691 692 def test_LM_heteroskedastic_nodemean(self): 693 resid = self.res1_restricted.wresid 694 n = resid.shape[0] 695 x = self.x 696 scores = x * resid[:, None] 697 S = np.dot(scores.T, scores) / n 698 Sinv = np.linalg.inv(S) 699 s = np.mean(scores, 0) 700 LMstat = n * np.dot(np.dot(s, Sinv), s.T) 701 LMstat_OLS = self.res2_full.compare_lm_test( 702 self.res2_restricted, demean=False 703 ) 704 LMstat2 = LMstat_OLS[0] 705 assert_almost_equal(LMstat, LMstat2, DECIMAL_7) 706 707 def test_LM_heteroskedastic_demean(self): 708 resid = self.res1_restricted.wresid 709 n = resid.shape[0] 710 x = self.x 711 scores = x * resid[:, None] 712 scores_demean = scores - scores.mean(0) 713 S = np.dot(scores_demean.T, scores_demean) / n 714 Sinv = np.linalg.inv(S) 715 s = np.mean(scores, 0) 716 LMstat = n * np.dot(np.dot(s, Sinv), s.T) 717 LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted) 718 LMstat2 = LMstat_OLS[0] 719 assert_almost_equal(LMstat, LMstat2, DECIMAL_7) 720 721 def test_LM_heteroskedastic_LRversion(self): 722 resid = self.res1_restricted.wresid 723 resid_full = self.res1_full.wresid 724 n = resid.shape[0] 725 x = self.x 726 scores = x * resid[:, None] 727 s = np.mean(scores, 0) 728 scores = x * resid_full[:, None] 729 S = np.dot(scores.T, scores) / n 730 Sinv = np.linalg.inv(S) 731 LMstat = n * np.dot(np.dot(s, Sinv), s.T) 732 LMstat_OLS = self.res2_full.compare_lm_test( 733 self.res2_restricted, use_lr=True 734 ) 735 LMstat2 = LMstat_OLS[0] 736 assert_almost_equal(LMstat, LMstat2, DECIMAL_7) 737 738 def test_LM_nonnested(self): 739 assert_raises( 740 ValueError, self.res2_restricted.compare_lm_test, self.res2_full 741 ) 742 743 744class TestOLS_GLS_WLS_equivalence(object): 745 @classmethod 746 def setup_class(cls): 747 data = longley.load() 748 data.exog = add_constant(data.exog, prepend=False) 749 y = data.endog 750 x = data.exog 751 n = y.shape[0] 752 w = np.ones(n) 753 cls.results = [] 754 cls.results.append(OLS(y, x).fit()) 755 cls.results.append(WLS(y, x, w).fit()) 756 # scaling weights does not change main results (except scale) 757 cls.results.append(GLS(y, x, 100 * w).fit()) 758 cls.results.append(GLS(y, x, np.diag(0.1 * w)).fit()) 759 760 def test_ll(self): 761 llf = np.array([r.llf for r in self.results]) 762 llf_1 = np.ones_like(llf) * self.results[0].llf 763 assert_almost_equal(llf, llf_1, DECIMAL_7) 764 765 ic = np.array([r.aic for r in self.results]) 766 ic_1 = np.ones_like(ic) * self.results[0].aic 767 assert_almost_equal(ic, ic_1, DECIMAL_7) 768 769 ic = np.array([r.bic for r in self.results]) 770 ic_1 = np.ones_like(ic) * self.results[0].bic 771 assert_almost_equal(ic, ic_1, DECIMAL_7) 772 773 def test_params(self): 774 params = np.array([r.params for r in self.results]) 775 params_1 = np.array([self.results[0].params] * len(self.results)) 776 assert_allclose(params, params_1) 777 778 def test_ss(self): 779 bse = np.array([r.bse for r in self.results]) 780 bse_1 = np.array([self.results[0].bse] * len(self.results)) 781 assert_allclose(bse, bse_1) 782 783 def test_rsquared(self): 784 rsquared = np.array([r.rsquared for r in self.results]) 785 rsquared_1 = np.array([self.results[0].rsquared] * len(self.results)) 786 assert_almost_equal(rsquared, rsquared_1, DECIMAL_7) 787 788 789class TestGLS_WLS_equivalence(TestOLS_GLS_WLS_equivalence): 790 # reuse test methods 791 792 @classmethod 793 def setup_class(cls): 794 data = longley.load() 795 data.exog = add_constant(data.exog, prepend=False) 796 y = data.endog 797 x = data.exog 798 n = y.shape[0] 799 np.random.seed(5) 800 w = np.random.uniform(0.5, 1, n) 801 w_inv = 1.0 / w 802 cls.results = [] 803 cls.results.append(WLS(y, x, w).fit()) 804 # scaling weights does not change main results (except scale) 805 cls.results.append(WLS(y, x, 0.01 * w).fit()) 806 cls.results.append(GLS(y, x, 100 * w_inv).fit()) 807 cls.results.append(GLS(y, x, np.diag(0.1 * w_inv)).fit()) 808 809 810class TestNonFit(object): 811 @classmethod 812 def setup_class(cls): 813 data = longley.load() 814 data.exog = add_constant(data.exog, prepend=False) 815 cls.endog = data.endog 816 cls.exog = data.exog 817 cls.ols_model = OLS(data.endog, data.exog) 818 819 def test_df_resid(self): 820 df_resid = self.endog.shape[0] - self.exog.shape[1] 821 assert_equal(self.ols_model.df_resid, 9) 822 823 824class TestWLS_CornerCases(object): 825 @classmethod 826 def setup_class(cls): 827 cls.exog = np.ones((1,)) 828 cls.endog = np.ones((1,)) 829 weights = 1 830 cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit() 831 832 def test_wrong_size_weights(self): 833 weights = np.ones((10, 10)) 834 assert_raises(ValueError, WLS, self.endog, self.exog, weights=weights) 835 836 837class TestWLSExogWeights(CheckRegressionResults): 838 # Test WLS with Greene's credit card data 839 # reg avgexp age income incomesq ownrent [aw=1/incomesq] 840 @classmethod 841 def setup_class(cls): 842 from statsmodels.datasets.ccard import load 843 844 from .results.results_regression import CCardWLS 845 846 dta = load() 847 endog = np.asarray(dta.endog) 848 exog = np.asarray(dta.exog) 849 exog = add_constant(exog, prepend=False) 850 nobs = 72.0 851 852 weights = 1 / exog[:, 2] 853 # for comparison with stata analytic weights 854 scaled_weights = (weights * nobs) / weights.sum() 855 856 cls.res1 = WLS(endog, exog, weights=scaled_weights).fit() 857 cls.res2 = CCardWLS() 858 cls.res2.wresid = scaled_weights ** 0.5 * cls.res2.resid 859 860 # correction because we use different definition for loglike/llf 861 corr_ic = 2 * (cls.res1.llf - cls.res2.llf) 862 cls.res2.aic -= corr_ic 863 cls.res2.bic -= corr_ic 864 cls.res2.llf += 0.5 * np.sum(np.log(cls.res1.model.weights)) 865 866 867def test_wls_example(): 868 # example from the docstring, there was a note about a bug, should 869 # be fixed now 870 Y = [1, 3, 4, 5, 2, 3, 4] 871 x = lrange(1, 8) 872 x = add_constant(x, prepend=False) 873 wls_model = WLS(Y, x, weights=lrange(1, 8)).fit() 874 # taken from R lm.summary 875 assert_almost_equal(wls_model.fvalue, 0.127337843215, 6) 876 assert_almost_equal(wls_model.scale, 2.44608530786 ** 2, 6) 877 878 879def test_wls_tss(): 880 y = np.array([22, 22, 22, 23, 23, 23]) 881 x = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]] 882 883 ols_mod = OLS(y, add_constant(x, prepend=False)).fit() 884 885 yw = np.array([22, 22, 23.0]) 886 Xw = [[1, 0], [1, 1], [0, 1]] 887 w = np.array([2, 1, 3.0]) 888 889 wls_mod = WLS(yw, add_constant(Xw, prepend=False), weights=w).fit() 890 assert_equal(ols_mod.centered_tss, wls_mod.centered_tss) 891 892 893class TestWLSScalarVsArray(CheckRegressionResults): 894 @classmethod 895 def setup_class(cls): 896 from statsmodels.datasets.longley import load 897 898 dta = load() 899 endog = np.asarray(dta.endog) 900 exog = np.asarray(dta.exog) 901 exog = add_constant(exog, prepend=True) 902 wls_scalar = WLS(endog, exog, weights=1.0 / 3).fit() 903 weights = [1 / 3.0] * len(endog) 904 wls_array = WLS(endog, exog, weights=weights).fit() 905 cls.res1 = wls_scalar 906 cls.res2 = wls_array 907 908 909class TestWLS_GLS(CheckRegressionResults): 910 @classmethod 911 def setup_class(cls): 912 from statsmodels.datasets.ccard import load 913 914 data = load() 915 endog = np.asarray(data.endog) 916 exog = np.asarray(data.exog) 917 sigma = exog[:, 2] 918 cls.res1 = WLS(endog, exog, weights=1 / sigma).fit() 919 cls.res2 = GLS(endog, exog, sigma=sigma).fit() 920 921 def check_confidenceintervals(self, conf1, conf2): # FIXME: never called 922 assert_almost_equal(conf1, conf2(), DECIMAL_4) 923 924 925def test_wls_missing(): 926 from statsmodels.datasets.ccard import load 927 928 data = load() 929 endog = data.endog 930 endog[[10, 25]] = np.nan 931 mod = WLS( 932 data.endog, data.exog, weights=1 / data.exog.iloc[:, 2], missing="drop" 933 ) 934 assert_equal(mod.endog.shape[0], 70) 935 assert_equal(mod.exog.shape[0], 70) 936 assert_equal(mod.weights.shape[0], 70) 937 938 939class TestWLS_OLS(CheckRegressionResults): 940 @classmethod 941 def setup_class(cls): 942 data = longley.load() 943 endog = np.asarray(data.endog) 944 exog = np.asarray(data.exog) 945 exog = add_constant(exog, prepend=False) 946 cls.res1 = OLS(endog, exog).fit() 947 cls.res2 = WLS(endog, exog).fit() 948 949 def check_confidenceintervals(self, conf1, conf2): # FIXME: never called 950 assert_almost_equal(conf1, conf2(), DECIMAL_4) 951 952 953class TestGLS_OLS(CheckRegressionResults): 954 @classmethod 955 def setup_class(cls): 956 data = longley.load() 957 endog = np.asarray(data.endog) 958 exog = np.asarray(data.exog) 959 exog = add_constant(exog, prepend=False) 960 cls.res1 = GLS(endog, exog).fit() 961 cls.res2 = OLS(endog, exog).fit() 962 963 def check_confidenceintervals(self, conf1, conf2): # FIXME: never called 964 assert_almost_equal(conf1, conf2(), DECIMAL_4) 965 966 967# FIXME: do not leave this commented-out sitting here 968# TODO: test AR 969# why the two-stage in AR? 970# class TestAR(object): 971# from statsmodels.datasets.sunspots import load 972# data = load() 973# model = AR(data.endog, rho=4).fit() 974# R_res = RModel(data.endog, aic="FALSE", order_max=4)# 975 976# def test_params(self): 977# assert_almost_equal(self.model.rho, 978# pass 979 980# def test_order(self): 981# In R this can be defined or chosen by minimizing the AIC if aic=True 982# pass 983 984 985class TestYuleWalker(object): 986 @classmethod 987 def setup_class(cls): 988 from statsmodels.datasets.sunspots import load 989 990 data = load() 991 cls.rho, cls.sigma = yule_walker(data.endog, order=4, method="mle") 992 cls.R_params = [ 993 1.2831003105694765, 994 -0.45240924374091945, 995 -0.20770298557575195, 996 0.047943648089542337, 997 ] 998 999 def test_params(self): 1000 assert_almost_equal(self.rho, self.R_params, DECIMAL_4) 1001 1002 1003class TestDataDimensions(CheckRegressionResults): 1004 @classmethod 1005 def setup_class(cls): 1006 np.random.seed(54321) 1007 cls.endog_n_ = np.random.uniform(0, 20, size=30) 1008 cls.endog_n_one = cls.endog_n_[:, None] 1009 cls.exog_n_ = np.random.uniform(0, 20, size=30) 1010 cls.exog_n_one = cls.exog_n_[:, None] 1011 cls.degen_exog = cls.exog_n_one[:-1] 1012 cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one) 1013 cls.mod1.df_model += 1 1014 cls.res1 = cls.mod1.fit() 1015 # Note that these are created for every subclass.. 1016 # A little extra overhead probably 1017 cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one) 1018 cls.mod2.df_model += 1 1019 cls.res2 = cls.mod2.fit() 1020 1021 def check_confidenceintervals(self, conf1, conf2): # FIXME: never called 1022 assert_almost_equal(conf1, conf2(), DECIMAL_4) 1023 1024 1025class TestGLS_large_data(TestDataDimensions): 1026 @classmethod 1027 def setup_class(cls): 1028 super(TestGLS_large_data, cls).setup_class() 1029 nobs = 1000 1030 y = np.random.randn(nobs, 1) 1031 x = np.random.randn(nobs, 20) 1032 sigma = np.ones_like(y) 1033 cls.gls_res = GLS(y, x, sigma=sigma).fit() 1034 cls.gls_res_scalar = GLS(y, x, sigma=1).fit() 1035 cls.gls_res_none = GLS(y, x).fit() 1036 cls.ols_res = OLS(y, x).fit() 1037 1038 def test_large_equal_params(self): 1039 assert_almost_equal( 1040 self.ols_res.params, self.gls_res.params, DECIMAL_7 1041 ) 1042 1043 def test_large_equal_loglike(self): 1044 assert_almost_equal(self.ols_res.llf, self.gls_res.llf, DECIMAL_7) 1045 1046 def test_large_equal_params_none(self): 1047 assert_almost_equal( 1048 self.gls_res.params, self.gls_res_none.params, DECIMAL_7 1049 ) 1050 1051 1052class TestNxNx(TestDataDimensions): 1053 @classmethod 1054 def setup_class(cls): 1055 super(TestNxNx, cls).setup_class() 1056 cls.mod2 = OLS(cls.endog_n_, cls.exog_n_) 1057 cls.mod2.df_model += 1 1058 cls.res2 = cls.mod2.fit() 1059 1060 1061class TestNxOneNx(TestDataDimensions): 1062 @classmethod 1063 def setup_class(cls): 1064 super(TestNxOneNx, cls).setup_class() 1065 cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_) 1066 cls.mod2.df_model += 1 1067 cls.res2 = cls.mod2.fit() 1068 1069 1070class TestNxNxOne(TestDataDimensions): 1071 @classmethod 1072 def setup_class(cls): 1073 super(TestNxNxOne, cls).setup_class() 1074 cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one) 1075 cls.mod2.df_model += 1 1076 cls.res2 = cls.mod2.fit() 1077 1078 1079def test_bad_size(): 1080 np.random.seed(54321) 1081 data = np.random.uniform(0, 20, 31) 1082 assert_raises(ValueError, OLS, data, data[1:]) 1083 1084 1085def test_const_indicator(): 1086 rs = np.random.RandomState(12345) 1087 x = rs.randint(0, 3, size=30) 1088 x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False) 1089 y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30) 1090 resc = OLS(y, add_constant(x.iloc[:, 1:], prepend=True)).fit() 1091 res = OLS(y, x, hasconst=True).fit() 1092 assert_almost_equal(resc.rsquared, res.rsquared, 12) 1093 assert res.model.data.k_constant == 1 1094 assert resc.model.data.k_constant == 1 1095 1096 1097def test_fvalue_const_only(): 1098 rs = np.random.RandomState(12345) 1099 x = rs.randint(0, 3, size=30) 1100 x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False) 1101 x.iloc[:, 0] = 1 1102 y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30) 1103 res = OLS(y, x, hasconst=True).fit(cov_type="HC1") 1104 assert not np.isnan(res.fvalue) 1105 assert isinstance(res.fvalue, float) 1106 assert isinstance(res.f_pvalue, float) 1107 1108 1109def test_conf_int_single_regressor(): 1110 # GH#706 single-regressor model (i.e. no intercept) with 1D exog 1111 # should get passed to DataFrame for conf_int 1112 y = pd.Series(np.random.randn(10)) 1113 x = pd.Series(np.ones(10)) 1114 res = OLS(y, x).fit() 1115 conf_int = res.conf_int() 1116 np.testing.assert_equal(conf_int.shape, (1, 2)) 1117 np.testing.assert_(isinstance(conf_int, pd.DataFrame)) 1118 1119 1120def test_summary_as_latex(): 1121 # GH#734 1122 import re 1123 1124 dta = longley.load_pandas() 1125 x = dta.exog 1126 x["constant"] = 1 1127 y = dta.endog 1128 res = OLS(y, x).fit() 1129 with pytest.warns(UserWarning): 1130 table = res.summary().as_latex() 1131 # replace the date and time 1132 table = re.sub( 1133 "(?<=\n\\\\textbf\\{Date:\\} &).+?&", 1134 " Sun, 07 Apr 2013 &", 1135 table, 1136 ) 1137 table = re.sub( 1138 "(?<=\n\\\\textbf\\{Time:\\} &).+?&", 1139 " 13:46:07 &", 1140 table, 1141 ) 1142 1143 expected = """\\begin{center} 1144\\begin{tabular}{lclc} 1145\\toprule 1146\\textbf{Dep. Variable:} & TOTEMP & \\textbf{ R-squared: } & 0.995 \\\\ 1147\\textbf{Model:} & OLS & \\textbf{ Adj. R-squared: } & 0.992 \\\\ 1148\\textbf{Method:} & Least Squares & \\textbf{ F-statistic: } & 330.3 \\\\ 1149\\textbf{Date:} & Sun, 07 Apr 2013 & \\textbf{ Prob (F-statistic):} & 4.98e-10 \\\\ 1150\\textbf{Time:} & 13:46:07 & \\textbf{ Log-Likelihood: } & -109.62 \\\\ 1151\\textbf{No. Observations:} & 16 & \\textbf{ AIC: } & 233.2 \\\\ 1152\\textbf{Df Residuals:} & 9 & \\textbf{ BIC: } & 238.6 \\\\ 1153\\textbf{Df Model:} & 6 & \\textbf{ } & \\\\ 1154\\textbf{Covariance Type:} & nonrobust & \\textbf{ } & \\\\ 1155\\bottomrule 1156\\end{tabular} 1157\\begin{tabular}{lcccccc} 1158 & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]} \\\\ 1159\\midrule 1160\\textbf{GNPDEFL} & 15.0619 & 84.915 & 0.177 & 0.863 & -177.029 & 207.153 \\\\ 1161\\textbf{GNP} & -0.0358 & 0.033 & -1.070 & 0.313 & -0.112 & 0.040 \\\\ 1162\\textbf{UNEMP} & -2.0202 & 0.488 & -4.136 & 0.003 & -3.125 & -0.915 \\\\ 1163\\textbf{ARMED} & -1.0332 & 0.214 & -4.822 & 0.001 & -1.518 & -0.549 \\\\ 1164\\textbf{POP} & -0.0511 & 0.226 & -0.226 & 0.826 & -0.563 & 0.460 \\\\ 1165\\textbf{YEAR} & 1829.1515 & 455.478 & 4.016 & 0.003 & 798.788 & 2859.515 \\\\ 1166\\textbf{constant} & -3.482e+06 & 8.9e+05 & -3.911 & 0.004 & -5.5e+06 & -1.47e+06 \\\\ 1167\\bottomrule 1168\\end{tabular} 1169\\begin{tabular}{lclc} 1170\\textbf{Omnibus:} & 0.749 & \\textbf{ Durbin-Watson: } & 2.559 \\\\ 1171\\textbf{Prob(Omnibus):} & 0.688 & \\textbf{ Jarque-Bera (JB): } & 0.684 \\\\ 1172\\textbf{Skew:} & 0.420 & \\textbf{ Prob(JB): } & 0.710 \\\\ 1173\\textbf{Kurtosis:} & 2.434 & \\textbf{ Cond. No. } & 4.86e+09 \\\\ 1174\\bottomrule 1175\\end{tabular} 1176%\\caption{OLS Regression Results} 1177\\end{center} 1178 1179Notes: \\newline 1180 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline 1181 [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline 1182 strong multicollinearity or other numerical problems.""" 1183 assert_equal(table, expected) 1184 1185 1186class TestRegularizedFit(object): 1187 1188 # Make sure there are no problems when no variables are selected. 1189 def test_empty_model(self): 1190 1191 np.random.seed(742) 1192 n = 100 1193 endog = np.random.normal(size=n) 1194 exog = np.random.normal(size=(n, 3)) 1195 1196 for cls in OLS, WLS, GLS: 1197 model = cls(endog, exog) 1198 result = model.fit_regularized(alpha=1000) 1199 assert_equal(result.params, 0.0) 1200 1201 def test_regularized(self): 1202 1203 import os 1204 1205 from .results import glmnet_r_results 1206 1207 cur_dir = os.path.dirname(os.path.abspath(__file__)) 1208 data = np.loadtxt( 1209 os.path.join(cur_dir, "results", "lasso_data.csv"), delimiter="," 1210 ) 1211 1212 tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")] 1213 1214 for test in tests: 1215 1216 vec = getattr(glmnet_r_results, test) 1217 1218 n = vec[0] 1219 p = vec[1] 1220 L1_wt = float(vec[2]) 1221 lam = float(vec[3]) 1222 params = vec[4:].astype(np.float64) 1223 1224 endog = data[0 : int(n), 0] 1225 exog = data[0 : int(n), 1 : (int(p) + 1)] 1226 1227 endog = endog - endog.mean() 1228 endog /= endog.std(ddof=1) 1229 exog = exog - exog.mean(0) 1230 exog /= exog.std(0, ddof=1) 1231 1232 for cls in OLS, WLS, GLS: 1233 mod = cls(endog, exog) 1234 rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam) 1235 assert_almost_equal(rslt.params, params, decimal=3) 1236 1237 # Smoke test for profile likelihood 1238 mod.fit_regularized(L1_wt=L1_wt, alpha=lam, profile_scale=True) 1239 1240 def test_regularized_weights(self): 1241 1242 np.random.seed(1432) 1243 exog1 = np.random.normal(size=(100, 3)) 1244 endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100) 1245 exog2 = np.random.normal(size=(100, 3)) 1246 endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100) 1247 1248 exog_a = np.vstack((exog1, exog1, exog2)) 1249 endog_a = np.concatenate((endog1, endog1, endog2)) 1250 1251 # Should be equivalent to exog_a, endog_a. 1252 exog_b = np.vstack((exog1, exog2)) 1253 endog_b = np.concatenate((endog1, endog2)) 1254 wgts = np.ones(200) 1255 wgts[0:100] = 2 1256 sigma = np.diag(1 / wgts) 1257 1258 for L1_wt in 0, 0.5, 1: 1259 for alpha in 0, 1: 1260 mod1 = OLS(endog_a, exog_a) 1261 rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1262 1263 mod2 = WLS(endog_b, exog_b, weights=wgts) 1264 rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1265 1266 mod3 = GLS(endog_b, exog_b, sigma=sigma) 1267 rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1268 1269 assert_almost_equal(rslt1.params, rslt2.params, decimal=3) 1270 assert_almost_equal(rslt1.params, rslt3.params, decimal=3) 1271 1272 def test_regularized_weights_list(self): 1273 1274 np.random.seed(132) 1275 exog1 = np.random.normal(size=(100, 3)) 1276 endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100) 1277 exog2 = np.random.normal(size=(100, 3)) 1278 endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100) 1279 1280 exog_a = np.vstack((exog1, exog1, exog2)) 1281 endog_a = np.concatenate((endog1, endog1, endog2)) 1282 1283 # Should be equivalent to exog_a, endog_a. 1284 exog_b = np.vstack((exog1, exog2)) 1285 endog_b = np.concatenate((endog1, endog2)) 1286 wgts = np.ones(200) 1287 wgts[0:100] = 2 1288 sigma = np.diag(1 / wgts) 1289 1290 for L1_wt in 0, 0.5, 1: 1291 for alpha_element in 0, 1: 1292 alpha = [ 1293 alpha_element, 1294 ] * 3 1295 1296 mod1 = OLS(endog_a, exog_a) 1297 rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1298 1299 mod2 = WLS(endog_b, exog_b, weights=wgts) 1300 rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1301 1302 mod3 = GLS(endog_b, exog_b, sigma=sigma) 1303 rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha) 1304 1305 assert_almost_equal(rslt1.params, rslt2.params, decimal=3) 1306 assert_almost_equal(rslt1.params, rslt3.params, decimal=3) 1307 1308 1309def test_formula_missing_cat(): 1310 # gh-805 1311 1312 from patsy import PatsyError 1313 1314 import statsmodels.api as sm 1315 from statsmodels.formula.api import ols 1316 1317 dta = sm.datasets.grunfeld.load_pandas().data 1318 dta.loc[dta.index[0], "firm"] = np.nan 1319 1320 mod = ols( 1321 formula="value ~ invest + capital + firm + year", data=dta.dropna() 1322 ) 1323 res = mod.fit() 1324 1325 mod2 = ols(formula="value ~ invest + capital + firm + year", data=dta) 1326 res2 = mod2.fit() 1327 1328 assert_almost_equal(res.params.values, res2.params.values) 1329 1330 assert_raises( 1331 PatsyError, 1332 ols, 1333 "value ~ invest + capital + firm + year", 1334 data=dta, 1335 missing="raise", 1336 ) 1337 1338 1339def test_missing_formula_predict(): 1340 # see 2171 1341 nsample = 30 1342 1343 data = np.linspace(0, 10, nsample) 1344 null = np.array([np.nan]) 1345 data = pd.DataFrame({"x": np.concatenate((data, null))}) 1346 beta = np.array([1, 0.1]) 1347 e = np.random.normal(size=nsample + 1) 1348 data["y"] = beta[0] + beta[1] * data["x"] + e 1349 model = OLS.from_formula("y ~ x", data=data) 1350 fit = model.fit() 1351 fit.predict(exog=data[:-1]) 1352 1353 1354def test_fvalue_implicit_constant(): 1355 # if constant is implicit, return nan see #2444 1356 nobs = 100 1357 np.random.seed(2) 1358 x = np.random.randn(nobs, 1) 1359 x = ((x > 0) == [True, False]).astype(int) 1360 y = x.sum(1) + np.random.randn(nobs) 1361 1362 from statsmodels.regression.linear_model import OLS, WLS 1363 1364 res = OLS(y, x).fit(cov_type="HC1") 1365 assert_(np.isnan(res.fvalue)) 1366 assert_(np.isnan(res.f_pvalue)) 1367 res.summary() 1368 1369 res = WLS(y, x).fit(cov_type="HC1") 1370 assert_(np.isnan(res.fvalue)) 1371 assert_(np.isnan(res.f_pvalue)) 1372 res.summary() 1373 1374 1375def test_fvalue_only_constant(): 1376 # if only constant in model, return nan see #3642 1377 nobs = 20 1378 np.random.seed(2) 1379 x = np.ones(nobs) 1380 y = np.random.randn(nobs) 1381 1382 from statsmodels.regression.linear_model import OLS, WLS 1383 1384 res = OLS(y, x).fit(cov_type="hac", cov_kwds={"maxlags": 3}) 1385 assert_(np.isnan(res.fvalue)) 1386 assert_(np.isnan(res.f_pvalue)) 1387 res.summary() 1388 1389 res = WLS(y, x).fit(cov_type="HC1") 1390 assert_(np.isnan(res.fvalue)) 1391 assert_(np.isnan(res.f_pvalue)) 1392 res.summary() 1393 1394 1395def test_ridge(): 1396 n = 100 1397 p = 5 1398 np.random.seed(3132) 1399 xmat = np.random.normal(size=(n, p)) 1400 yvec = xmat.sum(1) + np.random.normal(size=n) 1401 1402 v = np.ones(p) 1403 v[0] = 0 1404 1405 for a in (0, 1, 10): 1406 for alpha in (a, a * np.ones(p), a * v): 1407 model1 = OLS(yvec, xmat) 1408 result1 = model1._fit_ridge(alpha=alpha) 1409 model2 = OLS(yvec, xmat) 1410 result2 = model2.fit_regularized(alpha=alpha, L1_wt=0) 1411 assert_allclose(result1.params, result2.params) 1412 model3 = OLS(yvec, xmat) 1413 result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10) 1414 assert_allclose(result1.params, result3.params) 1415 1416 fv1 = result1.fittedvalues 1417 fv2 = np.dot(xmat, result1.params) 1418 assert_allclose(fv1, fv2) 1419 1420 1421def test_regularized_refit(): 1422 n = 100 1423 p = 5 1424 np.random.seed(3132) 1425 xmat = np.random.normal(size=(n, p)) 1426 # covariates 0 and 2 matter 1427 yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n) 1428 model1 = OLS(yvec, xmat) 1429 result1 = model1.fit_regularized(alpha=2.0, L1_wt=0.5, refit=True) 1430 model2 = OLS(yvec, xmat[:, [0, 2]]) 1431 result2 = model2.fit() 1432 ii = [0, 2] 1433 assert_allclose(result1.params[ii], result2.params) 1434 assert_allclose(result1.bse[ii], result2.bse) 1435 1436 1437def test_regularized_predict(): 1438 # this also compares WLS with GLS 1439 n = 100 1440 p = 5 1441 np.random.seed(3132) 1442 xmat = np.random.normal(size=(n, p)) 1443 yvec = xmat.sum(1) + np.random.normal(size=n) 1444 wgt = np.random.uniform(1, 2, n) 1445 model_wls = WLS(yvec, xmat, weights=wgt) 1446 # TODO: params is not the same in GLS if sigma=1 / wgt, i.e 1-dim, #7755 1447 model_gls1 = GLS(yvec, xmat, sigma=np.diag(1 / wgt)) 1448 model_gls2 = GLS(yvec, xmat, sigma=1 / wgt) 1449 res = [] 1450 for model1 in [model_wls, model_gls1, model_gls2]: 1451 result1 = model1.fit_regularized(alpha=20.0, L1_wt=0.5, refit=True) 1452 res.append(result1) 1453 params = result1.params 1454 fittedvalues = np.dot(xmat, params) 1455 pr = model1.predict(result1.params) 1456 assert_allclose(fittedvalues, pr) 1457 assert_allclose(result1.fittedvalues, pr) 1458 1459 pr = result1.predict() 1460 assert_allclose(fittedvalues, pr) 1461 1462 assert_allclose(res[0].model.wendog, res[1].model.wendog, rtol=1e-10) 1463 assert_allclose(res[0].model.wexog, res[1].model.wexog, rtol=1e-10) 1464 assert_allclose(res[0].fittedvalues, res[1].fittedvalues, rtol=1e-10) 1465 assert_allclose(res[0].params, res[1].params, rtol=1e-10) 1466 1467 assert_allclose(res[0].model.wendog, res[2].model.wendog, rtol=1e-10) 1468 assert_allclose(res[0].model.wexog, res[2].model.wexog, rtol=1e-10) 1469 assert_allclose(res[0].fittedvalues, res[2].fittedvalues, rtol=1e-10) 1470 assert_allclose(res[0].params, res[2].params, rtol=1e-10) 1471 1472 1473def test_regularized_options(): 1474 n = 100 1475 p = 5 1476 np.random.seed(3132) 1477 xmat = np.random.normal(size=(n, p)) 1478 yvec = xmat.sum(1) + np.random.normal(size=n) 1479 model1 = OLS(yvec - 1, xmat) 1480 result1 = model1.fit_regularized(alpha=1.0, L1_wt=0.5) 1481 model2 = OLS(yvec, xmat, offset=1) 1482 result2 = model2.fit_regularized( 1483 alpha=1.0, L1_wt=0.5, start_params=np.zeros(5) 1484 ) 1485 assert_allclose(result1.params, result2.params) 1486 1487 1488def test_burg(): 1489 rnd = np.random.RandomState(12345) 1490 e = rnd.randn(10001) 1491 y = e[1:] + 0.5 * e[:-1] 1492 # R, ar.burg 1493 expected = [ 1494 [0.3909931], 1495 [0.4602607, -0.1771582], 1496 [0.47473245, -0.21475602, 0.08168813], 1497 [0.4787017, -0.2251910, 0.1047554, -0.0485900], 1498 [0.47975462, -0.22746106, 0.10963527, -0.05896347, 0.02167001], 1499 ] 1500 1501 for i in range(1, 6): 1502 ar, _ = burg(y, i) 1503 assert_allclose(ar, expected[i - 1], atol=1e-6) 1504 as_nodemean, _ = burg(1 + y, i, False) 1505 assert np.all(ar != as_nodemean) 1506 1507 1508def test_burg_errors(): 1509 with pytest.raises(ValueError): 1510 burg(np.ones((100, 2))) 1511 with pytest.raises(ValueError): 1512 burg(np.random.randn(100), 0) 1513 with pytest.raises(ValueError): 1514 burg(np.random.randn(100), "apple") 1515 1516 1517@pytest.mark.skipif(not has_cvxopt, reason="sqrt_lasso requires cvxopt") 1518def test_sqrt_lasso(): 1519 1520 np.random.seed(234923) 1521 1522 # Based on the example in the Belloni paper 1523 n = 100 1524 p = 500 1525 ii = np.arange(p) 1526 cx = 0.5 ** np.abs(np.subtract.outer(ii, ii)) 1527 cxr = np.linalg.cholesky(cx) 1528 1529 x = np.dot(np.random.normal(size=(n, p)), cxr.T) 1530 b = np.zeros(p) 1531 b[0:5] = [1, 1, 1, 1, 1] 1532 1533 from scipy.stats.distributions import norm 1534 1535 alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p)) 1536 1537 # Use very low noise level for a unit test 1538 y = np.dot(x, b) + 0.25 * np.random.normal(size=n) 1539 1540 # At low noise levels, the sqrt lasso should be around a 1541 # factor of 3 from the oracle without refit, and should 1542 # almost equal the oracle with refit. 1543 expected_oracle = {False: 3, True: 1} 1544 1545 # Used for regression testing 1546 expected_params = { 1547 False: np.r_[ 1548 0.87397122, 0.96051874, 0.9905915, 0.93868953, 0.90771773 1549 ], 1550 True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403], 1551 } 1552 1553 for refit in False, True: 1554 1555 rslt = OLS(y, x).fit_regularized( 1556 method="sqrt_lasso", alpha=alpha, refit=refit 1557 ) 1558 err = rslt.params - b 1559 numer = np.sqrt(np.dot(err, np.dot(cx, err))) 1560 1561 oracle = OLS(y, x[:, 0:5]).fit() 1562 oracle_err = np.zeros(p) 1563 oracle_err[0:5] = oracle.params - b[0:5] 1564 denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err))) 1565 1566 # Check performance relative to oracle, should be around 1567 assert_allclose( 1568 numer / denom, expected_oracle[refit], rtol=0.5, atol=0.1 1569 ) 1570 1571 # Regression test the parameters 1572 assert_allclose( 1573 rslt.params[0:5], expected_params[refit], rtol=1e-5, atol=1e-5 1574 ) 1575 1576 1577def test_bool_regressor(reset_randomstate): 1578 exog = np.random.randint(0, 2, size=(100, 2)).astype(bool) 1579 endog = np.random.standard_normal(100) 1580 bool_res = OLS(endog, exog).fit() 1581 res = OLS(endog, exog.astype(np.double)).fit() 1582 assert_allclose(bool_res.params, res.params) 1583 1584 1585def test_ols_constant(reset_randomstate): 1586 y = np.random.standard_normal((200)) 1587 x = np.ones((200, 1)) 1588 res = OLS(y, x).fit() 1589 with pytest.warns(None) as recording: 1590 assert np.isnan(res.fvalue) 1591 assert np.isnan(res.f_pvalue) 1592 assert len(recording) == 0 1593 1594 1595def test_summary_no_constant(): 1596 rs = np.random.RandomState(0) 1597 x = rs.standard_normal((100, 2)) 1598 y = rs.standard_normal(100) 1599 summary = OLS(y, x).fit().summary() 1600 assert "R² is computed " in summary.as_text() 1601