1"""
2Test functions for models.regression
3"""
4# TODO: Test for LM
5from statsmodels.compat.python import lrange
6
7import warnings
8
9import numpy as np
10from numpy.testing import (
11    assert_,
12    assert_allclose,
13    assert_almost_equal,
14    assert_equal,
15    assert_raises,
16)
17import pandas as pd
18import pytest
19from scipy.linalg import toeplitz
20from scipy.stats import t as student_t
21
22from statsmodels.datasets import longley
23from statsmodels.regression.linear_model import (
24    GLS,
25    OLS,
26    WLS,
27    burg,
28    yule_walker,
29)
30from statsmodels.tools.tools import add_constant
31
32DECIMAL_4 = 4
33DECIMAL_3 = 3
34DECIMAL_2 = 2
35DECIMAL_1 = 1
36DECIMAL_7 = 7
37DECIMAL_0 = 0
38
39try:
40    import cvxopt  # noqa:F401
41
42    has_cvxopt = True
43except ImportError:
44    has_cvxopt = False
45
46
47class CheckRegressionResults(object):
48    """
49    res2 contains results from Rmodelwrap or were obtained from a statistical
50    packages such as R, Stata, or SAS and were written to model_results
51    """

    decimal_params = DECIMAL_4

    def test_params(self):
        assert_almost_equal(
            self.res1.params, self.res2.params, self.decimal_params
        )

    decimal_standarderrors = DECIMAL_4

    def test_standarderrors(self):
        assert_allclose(
            self.res1.bse,
            self.res2.bse,
            rtol=10 ** -self.decimal_standarderrors,
        )

    decimal_confidenceintervals = DECIMAL_4

    def test_confidenceintervals(self):
        # NOTE: Stata rounds the residuals (at least) to a fixed number of
        # significant digits, so compare approximately rather than exactly
        conf1 = self.res1.conf_int()
        conf2 = self.res2.conf_int()
        for i in range(len(conf1)):
            assert_allclose(
                conf1[i][0],
                conf2[i][0],
                rtol=10 ** -self.decimal_confidenceintervals,
            )
            assert_allclose(
                conf1[i][1],
                conf2[i][1],
                rtol=10 ** -self.decimal_confidenceintervals,
            )

    decimal_conf_int_subset = DECIMAL_4

    def test_conf_int_subset(self):
        if len(self.res1.params) > 1:
            ci1 = self.res1.conf_int(cols=(1, 2))
            ci2 = self.res1.conf_int()[1:3]
            assert_almost_equal(ci1, ci2, self.decimal_conf_int_subset)
        else:
            pass

    decimal_scale = DECIMAL_4

    def test_scale(self):
        assert_almost_equal(
            self.res1.scale, self.res2.scale, self.decimal_scale
        )

    decimal_rsquared = DECIMAL_4

    def test_rsquared(self):
        assert_almost_equal(
            self.res1.rsquared, self.res2.rsquared, self.decimal_rsquared
        )

    decimal_rsquared_adj = DECIMAL_4

    def test_rsquared_adj(self):
        assert_almost_equal(
            self.res1.rsquared_adj,
            self.res2.rsquared_adj,
            self.decimal_rsquared_adj,
        )

    def test_degrees(self):
        assert_equal(self.res1.model.df_model, self.res2.df_model)
        assert_equal(self.res1.model.df_resid, self.res2.df_resid)

    decimal_ess = DECIMAL_4

    def test_ess(self):
        # Explained Sum of Squares
        assert_almost_equal(self.res1.ess, self.res2.ess, self.decimal_ess)

    decimal_ssr = DECIMAL_4

    def test_sumof_squaredresids(self):
        assert_almost_equal(self.res1.ssr, self.res2.ssr, self.decimal_ssr)

    decimal_mse_resid = DECIMAL_4

    def test_mse_resid(self):
        # Mean squared error of the residuals
        assert_almost_equal(
            self.res1.mse_resid, self.res2.mse_resid, self.decimal_mse_resid
        )

    decimal_mse_model = DECIMAL_4

    def test_mse_model(self):
        # Mean squared error of the model
        assert_almost_equal(
            self.res1.mse_model, self.res2.mse_model, self.decimal_mse_model
        )

    decimal_mse_total = DECIMAL_4

    def test_mse_total(self):
        assert_almost_equal(
            self.res1.mse_total,
            self.res2.mse_total,
            self.decimal_mse_total,
            err_msg="Test class %s" % self,
        )

    decimal_fvalue = DECIMAL_4

    def test_fvalue(self):
        # NOTE: assert_almost_equal treats matching infinities as equal,
        # so no special-casing is needed when both fvalues are -inf
        assert_almost_equal(
            self.res1.fvalue, self.res2.fvalue, self.decimal_fvalue
        )

    decimal_loglike = DECIMAL_4

    def test_loglike(self):
        assert_almost_equal(self.res1.llf, self.res2.llf, self.decimal_loglike)

    decimal_aic = DECIMAL_4

    def test_aic(self):
        assert_almost_equal(self.res1.aic, self.res2.aic, self.decimal_aic)
        # the following just checks the definition
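        # Definitions checked below (k = df_model + k_constant, the number
        # of estimated mean parameters):
        #     aicc = aic + 2 * k * (k + 1) / (nobs - k - 1)
        #     hqic = aic - 2 * k + 2 * k * log(log(nobs))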
        aicc1 = self.res1.info_criteria("aicc")
        k = self.res1.df_model + self.res1.model.k_constant
        nobs = self.res1.model.nobs
        aicc2 = self.res1.aic + 2 * (k**2 + k) / (nobs - k - 1)
        assert_allclose(aicc1, aicc2, rtol=1e-10)
        hqic1 = self.res1.info_criteria("hqic")
        hqic2 = (self.res1.aic - 2 * k) + 2 * np.log(np.log(nobs)) * k
        assert_allclose(hqic1, hqic2, rtol=1e-10)

    decimal_bic = DECIMAL_4

    def test_bic(self):
        assert_almost_equal(self.res1.bic, self.res2.bic, self.decimal_bic)

    decimal_pvalues = DECIMAL_4

    def test_pvalues(self):
        assert_almost_equal(
            self.res1.pvalues, self.res2.pvalues, self.decimal_pvalues
        )

    decimal_wresid = DECIMAL_4

    def test_wresid(self):
        assert_almost_equal(
            self.res1.wresid, self.res2.wresid, self.decimal_wresid
        )

    decimal_resids = DECIMAL_4

    def test_resids(self):
        assert_almost_equal(
            self.res1.resid, self.res2.resid, self.decimal_resids
        )

    decimal_norm_resids = DECIMAL_4

    def test_norm_resids(self):
        assert_almost_equal(
            self.res1.resid_pearson,
            self.res2.resid_pearson,
            self.decimal_norm_resids,
        )


# TODO: test fittedvalues and what else?


class TestOLS(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        from .results.results_regression import Longley

        data = longley.load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        exog = add_constant(exog, prepend=False)
        res1 = OLS(endog, exog).fit()
        res2 = Longley()
        res2.wresid = res1.wresid  # workaround hack
        cls.res1 = res1
        cls.res2 = res2

        res_qr = OLS(endog, exog).fit(method="qr")

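        # Pre-attach the QR factors so that fit(method="qr") can reuse
        # them instead of recomputing the decomposition; this exercises
        # the cached-decomposition code path.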
        model_qr = OLS(endog, exog)
        Q, R = np.linalg.qr(exog)
        model_qr.exog_Q, model_qr.exog_R = Q, R
        model_qr.normalized_cov_params = np.linalg.inv(np.dot(R.T, R))
        model_qr.rank = np.linalg.matrix_rank(R)
        res_qr2 = model_qr.fit(method="qr")

        cls.res_qr = res_qr
        cls.res_qr_manual = res_qr2

    def test_eigenvalues(self):
        eigenval_perc_diff = (
            self.res_qr.eigenvals - self.res_qr_manual.eigenvals
        )
        eigenval_perc_diff /= self.res_qr.eigenvals
        zeros = np.zeros_like(eigenval_perc_diff)
        assert_almost_equal(eigenval_perc_diff, zeros, DECIMAL_7)

    # Robust error tests.  Compare values computed with SAS
    def test_HC0_errors(self):
        # The last standard error is checked separately because the copied
        # results carry too few significant digits for a DECIMAL_4
        # comparison.
        assert_almost_equal(
            self.res1.HC0_se[:-1], self.res2.HC0_se[:-1], DECIMAL_4
        )
        assert_allclose(np.round(self.res1.HC0_se[-1]), self.res2.HC0_se[-1])

    def test_HC1_errors(self):
        assert_almost_equal(
            self.res1.HC1_se[:-1], self.res2.HC1_se[:-1], DECIMAL_4
        )
        # Note: tolerance is tight; rtol=3e-7 fails while 4e-7 passes
        assert_allclose(self.res1.HC1_se[-1], self.res2.HC1_se[-1], rtol=4e-7)

    def test_HC2_errors(self):
        assert_almost_equal(
            self.res1.HC2_se[:-1], self.res2.HC2_se[:-1], DECIMAL_4
        )
        # Note: tolerance is tight; rtol=4e-7 fails while 5e-7 passes
        assert_allclose(self.res1.HC2_se[-1], self.res2.HC2_se[-1], rtol=5e-7)

    def test_HC3_errors(self):
        assert_almost_equal(
            self.res1.HC3_se[:-1], self.res2.HC3_se[:-1], DECIMAL_4
        )
        # Note: tolerance is tight; rtol=1e-7 fails while 1.5e-7 passes
        assert_allclose(
            self.res1.HC3_se[-1], self.res2.HC3_se[-1], rtol=1.5e-7
        )

    def test_qr_params(self):
        assert_almost_equal(self.res1.params, self.res_qr.params, 6)

    def test_qr_normalized_cov_params(self):
        # TODO: use assert_allclose
        assert_almost_equal(
            np.ones_like(self.res1.normalized_cov_params),
            self.res1.normalized_cov_params
            / self.res_qr.normalized_cov_params,
            5,
        )

    def test_missing(self):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        data.endog[[3, 7, 14]] = np.nan
        mod = OLS(data.endog, data.exog, missing="drop")
        assert_equal(mod.endog.shape[0], 13)
        assert_equal(mod.exog.shape[0], 13)

    def test_rsquared_adj_overfit(self):
        # Test that if df_resid = 0, rsquared_adj is np.nan.
        # This is a regression test for user issue:
        # https://github.com/statsmodels/statsmodels/issues/868
        with warnings.catch_warnings(record=True):
            x = np.random.randn(5)
            y = np.random.randn(5, 6)
            results = OLS(x, y).fit()
            rsquared_adj = results.rsquared_adj
            assert_equal(rsquared_adj, np.nan)

    def test_qr_alternatives(self):
        assert_allclose(
            self.res_qr.params, self.res_qr_manual.params, rtol=5e-12
        )

    def test_norm_resid(self):
        resid = self.res1.wresid
        norm_resid = resid / np.sqrt(np.sum(resid ** 2.0) / self.res1.df_resid)
        model_norm_resid = self.res1.resid_pearson
        assert_almost_equal(model_norm_resid, norm_resid, DECIMAL_7)

    def test_summary_slim(self):
        # check that slim summary is smaller, does not verify content
        summ = self.res1.summary(slim=True)
        assert len(summ.tables) == 2
        assert len(str(summ)) < 6700

    def test_norm_resid_zero_variance(self):
        with warnings.catch_warnings(record=True):
            y = self.res1.model.endog
            res = OLS(y, y).fit()
            assert_allclose(res.scale, 0, atol=1e-20)
            assert_allclose(res.wresid, res.resid_pearson, atol=5e-11)


class TestRTO(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        from .results.results_regression import LongleyRTO

        data = longley.load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        res1 = OLS(endog, exog).fit()
        res2 = LongleyRTO()
        res2.wresid = res1.wresid  # workaround hack
        cls.res1 = res1
        cls.res2 = res2

        res_qr = OLS(endog, exog).fit(method="qr")
        cls.res_qr = res_qr


class TestFtest(object):
    """
    Tests f_test vs. RegressionResults
    """

    @classmethod
    def setup_class(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        cls.res1 = OLS(data.endog, data.exog).fit()
        R = np.identity(7)[:-1, :]
        cls.Ftest = cls.res1.f_test(R)

    def test_F(self):
        assert_almost_equal(self.Ftest.fvalue, self.res1.fvalue, DECIMAL_4)

    def test_p(self):
        assert_almost_equal(self.Ftest.pvalue, self.res1.f_pvalue, DECIMAL_4)

    def test_Df_denom(self):
        assert_equal(self.Ftest.df_denom, self.res1.model.df_resid)

    def test_Df_num(self):
        assert_equal(self.Ftest.df_num, 6)


class TestFTest2(object):
    """
    A joint test that the coefficient on
    GNP = the coefficient on UNEMP  and that the coefficient on
    POP = the coefficient on YEAR for the Longley dataset.

    Ftest1 is from statsmodels.  Results are from Rpy using R's car library.
    """

    @classmethod
    def setup_class(cls):
        data = longley.load()
        columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)]
        data.exog.columns = columns
        data.exog = add_constant(data.exog, prepend=False)
        res1 = OLS(data.endog, data.exog).fit()
        R2 = [[0, 1, -1, 0, 0, 0, 0], [0, 0, 0, 0, 1, -1, 0]]
        cls.Ftest1 = res1.f_test(R2)
        hyp = "x2 = x3, x5 = x6"
        cls.NewFtest1 = res1.f_test(hyp)

    def test_new_ftest(self):
        assert_equal(self.NewFtest1.fvalue, self.Ftest1.fvalue)

    def test_fvalue(self):
        assert_almost_equal(self.Ftest1.fvalue, 9.7404618732968196, DECIMAL_4)

    def test_pvalue(self):
        assert_almost_equal(
            self.Ftest1.pvalue, 0.0056052885317493459, DECIMAL_4
        )

    def test_df_denom(self):
        assert_equal(self.Ftest1.df_denom, 9)

    def test_df_num(self):
        assert_equal(self.Ftest1.df_num, 2)


class TestFtestQ(object):
    """
    A joint hypothesis test that Rb = q.  Coefficient tests are essentially
    made up.  Test values taken from Stata.
    """

    @classmethod
    def setup_class(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        res1 = OLS(data.endog, data.exog).fit()
        R = np.array(
            [
                [0, 1, 1, 0, 0, 0, 0],
                [0, 1, 0, 1, 0, 0, 0],
                [0, 1, 0, 0, 0, 0, 0],
                [0, 0, 0, 0, 1, 0, 0],
                [0, 0, 0, 0, 0, 1, 0],
            ]
        )
        q = np.array([0, 0, 0, 1, 0])
        cls.Ftest1 = res1.f_test((R, q))

    def test_fvalue(self):
        assert_almost_equal(self.Ftest1.fvalue, 70.115557, 5)

    def test_pvalue(self):
        assert_almost_equal(self.Ftest1.pvalue, 6.229e-07, 10)

    def test_df_denom(self):
        assert_equal(self.Ftest1.df_denom, 9)

    def test_df_num(self):
        assert_equal(self.Ftest1.df_num, 5)


class TestTtest(object):
468    """
469    Test individual t-tests.  Ie., are the coefficients significantly
470    different than zero.
471    """
472
473    @classmethod
474    def setup_class(cls):
475        data = longley.load()
476        columns = [f"x{i}" for i in range(1, data.exog.shape[1] + 1)]
477        data.exog.columns = columns
478        data.exog = add_constant(data.exog, prepend=False)
479        cls.res1 = OLS(data.endog, data.exog).fit()
480        R = np.identity(7)
481        cls.Ttest = cls.res1.t_test(R)
482        hyp = "x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0, const = 0"
483        cls.NewTTest = cls.res1.t_test(hyp)
484
485    def test_new_tvalue(self):
486        assert_equal(self.NewTTest.tvalue, self.Ttest.tvalue)
487
488    def test_tvalue(self):
489        assert_almost_equal(self.Ttest.tvalue, self.res1.tvalues, DECIMAL_4)
490
491    def test_sd(self):
492        assert_almost_equal(self.Ttest.sd, self.res1.bse, DECIMAL_4)
493
494    def test_pvalue(self):
495        assert_almost_equal(
496            self.Ttest.pvalue,
497            student_t.sf(np.abs(self.res1.tvalues), self.res1.model.df_resid)
498            * 2,
499            DECIMAL_4,
500        )
501
502    def test_df_denom(self):
503        assert_equal(self.Ttest.df_denom, self.res1.model.df_resid)
504
505    def test_effect(self):
506        assert_almost_equal(self.Ttest.effect, self.res1.params)
507
508
509class TestTtest2(object):
510    """
511    Tests the hypothesis that the coefficients on POP and YEAR
512    are equal.
513
514    Results from RPy using 'car' package.
515    """
516
517    @classmethod
518    def setup_class(cls):
519        R = np.zeros(7)
520        R[4:6] = [1, -1]
521        data = longley.load()
522        data.exog = add_constant(data.exog, prepend=False)
523        res1 = OLS(data.endog, data.exog).fit()
524        cls.Ttest1 = res1.t_test(R)
525
526    def test_tvalue(self):
527        assert_almost_equal(self.Ttest1.tvalue, -4.0167754636397284, DECIMAL_4)
528
529    def test_sd(self):
530        assert_almost_equal(self.Ttest1.sd, 455.39079425195314, DECIMAL_4)
531
532    def test_pvalue(self):
533        assert_almost_equal(
534            self.Ttest1.pvalue, 2 * 0.0015163772380932246, DECIMAL_4
535        )
536
537    def test_df_denom(self):
538        assert_equal(self.Ttest1.df_denom, 9)
539
540    def test_effect(self):
541        assert_almost_equal(self.Ttest1.effect, -1829.2025687186533, DECIMAL_4)
542
543
544class TestGLS(object):
545    """
546    These test results were obtained by replication with R.
547    """
548
549    @classmethod
550    def setup_class(cls):
551        from .results.results_regression import LongleyGls
552
553        data = longley.load()
554        exog = add_constant(
555            np.column_stack((data.exog.iloc[:, 1], data.exog.iloc[:, 4])),
556            prepend=False,
557        )
558        tmp_results = OLS(data.endog, exog).fit()
559        rho = np.corrcoef(tmp_results.resid[1:], tmp_results.resid[:-1])[0][
560            1
561        ]  # by assumption
562        order = toeplitz(np.arange(16))
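        # AR(1) error covariance: sigma[i, j] = rho ** |i - j|, built from
        # the Toeplitz matrix of lag distances for the 16 observations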
        sigma = rho ** order
        GLS_results = GLS(data.endog, exog, sigma=sigma).fit()
        cls.res1 = GLS_results
        cls.res2 = LongleyGls()
        # attach for test_missing
        cls.sigma = sigma
        cls.exog = exog
        cls.endog = data.endog

    def test_aic(self):
        # Note: tolerance is tight; rtol=3e-3 fails while 4e-3 passes
        assert_allclose(self.res1.aic + 2, self.res2.aic, rtol=4e-3)

    def test_bic(self):
        # Note: tolerance is tight; rtol=1e-2 fails while 1.5e-2 passes
        assert_allclose(self.res1.bic, self.res2.bic, rtol=1.5e-2)

    def test_loglike(self):
        assert_almost_equal(self.res1.llf, self.res2.llf, DECIMAL_0)

    def test_params(self):
        assert_almost_equal(self.res1.params, self.res2.params, DECIMAL_1)

    def test_resid(self):
        assert_almost_equal(self.res1.resid, self.res2.resid, DECIMAL_4)

    def test_scale(self):
        assert_almost_equal(self.res1.scale, self.res2.scale, DECIMAL_4)

    def test_tvalues(self):
        assert_almost_equal(self.res1.tvalues, self.res2.tvalues, DECIMAL_4)

    def test_standarderrors(self):
        assert_almost_equal(self.res1.bse, self.res2.bse, DECIMAL_4)

    def test_fittedvalues(self):
        assert_almost_equal(
            self.res1.fittedvalues, self.res2.fittedvalues, DECIMAL_4
        )

    def test_pvalues(self):
        assert_almost_equal(self.res1.pvalues, self.res2.pvalues, DECIMAL_4)

    def test_missing(self):
        endog = self.endog.copy()  # copy; otherwise endog changes for other tests
        endog[[4, 7, 14]] = np.nan
        mod = GLS(endog, self.exog, sigma=self.sigma, missing="drop")
        assert_equal(mod.endog.shape[0], 13)
        assert_equal(mod.exog.shape[0], 13)
        assert_equal(mod.sigma.shape, (13, 13))


class TestGLS_alt_sigma(CheckRegressionResults):
    """
    Test that GLS with no argument is equivalent to OLS.
    """

    @classmethod
    def setup_class(cls):
        data = longley.load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        exog = add_constant(exog, prepend=False)
        ols_res = OLS(endog, exog).fit()
        gls_res = GLS(endog, exog).fit()
        gls_res_scalar = GLS(endog, exog, sigma=1)
        cls.endog = endog
        cls.exog = exog
        cls.res1 = gls_res
        cls.res2 = ols_res
        cls.res3 = gls_res_scalar

    #        self.res2.conf_int = self.res2.conf_int()

    def test_wrong_size_sigma_1d(self):
        n = len(self.endog)
        assert_raises(
            ValueError, GLS, self.endog, self.exog, sigma=np.ones(n - 1)
        )

    def test_wrong_size_sigma_2d(self):
        n = len(self.endog)
        assert_raises(
            ValueError,
            GLS,
            self.endog,
            self.exog,
            sigma=np.ones((n - 1, n - 1)),
        )


# FIXME: do not leave commented-out, use or move/remove
#    def check_confidenceintervals(self, conf1, conf2):
#        assert_almost_equal(conf1, conf2, DECIMAL_4)


class TestLM(object):
    @classmethod
    def setup_class(cls):
        # TODO: Test HAC method
        rs = np.random.RandomState(1234)
        x = rs.randn(100, 3)
        b = np.ones((3, 1))
        e = rs.randn(100, 1)
        y = np.dot(x, b) + e
        # Cases?
        # Homoskedastic
        # HC0
        cls.res1_full = OLS(y, x).fit()
        cls.res1_restricted = OLS(y, x[:, 0]).fit()

        cls.res2_full = cls.res1_full.get_robustcov_results("HC0")
        cls.res2_restricted = cls.res1_restricted.get_robustcov_results("HC0")

        cls.x = x
        cls.Y = y

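    # Each test below recomputes the score (LM) statistic by hand as
    #     LM = n * s' S^{-1} s,
    # where s is the mean score of the restricted model (x_i * e_i averaged
    # over observations) and S is an estimate of the score covariance, and
    # compares it against compare_lm_test.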
    def test_LM_homoskedastic(self):
        resid = self.res1_restricted.wresid
        n = resid.shape[0]
        x = self.x
        S = np.dot(resid, resid) / n * np.dot(x.T, x) / n
        Sinv = np.linalg.inv(S)
        s = np.mean(x * resid[:, None], 0)
        LMstat = n * np.dot(np.dot(s, Sinv), s.T)
        LMstat_OLS = self.res1_full.compare_lm_test(self.res1_restricted)
        LMstat2 = LMstat_OLS[0]
        assert_almost_equal(LMstat, LMstat2, DECIMAL_7)

    def test_LM_heteroskedastic_nodemean(self):
        resid = self.res1_restricted.wresid
        n = resid.shape[0]
        x = self.x
        scores = x * resid[:, None]
        S = np.dot(scores.T, scores) / n
        Sinv = np.linalg.inv(S)
        s = np.mean(scores, 0)
        LMstat = n * np.dot(np.dot(s, Sinv), s.T)
        LMstat_OLS = self.res2_full.compare_lm_test(
            self.res2_restricted, demean=False
        )
        LMstat2 = LMstat_OLS[0]
        assert_almost_equal(LMstat, LMstat2, DECIMAL_7)

    def test_LM_heteroskedastic_demean(self):
        resid = self.res1_restricted.wresid
        n = resid.shape[0]
        x = self.x
        scores = x * resid[:, None]
        scores_demean = scores - scores.mean(0)
        S = np.dot(scores_demean.T, scores_demean) / n
        Sinv = np.linalg.inv(S)
        s = np.mean(scores, 0)
        LMstat = n * np.dot(np.dot(s, Sinv), s.T)
        LMstat_OLS = self.res2_full.compare_lm_test(self.res2_restricted)
        LMstat2 = LMstat_OLS[0]
        assert_almost_equal(LMstat, LMstat2, DECIMAL_7)

    def test_LM_heteroskedastic_LRversion(self):
        resid = self.res1_restricted.wresid
        resid_full = self.res1_full.wresid
        n = resid.shape[0]
        x = self.x
        scores = x * resid[:, None]
        s = np.mean(scores, 0)
        scores = x * resid_full[:, None]
        S = np.dot(scores.T, scores) / n
        Sinv = np.linalg.inv(S)
        LMstat = n * np.dot(np.dot(s, Sinv), s.T)
        LMstat_OLS = self.res2_full.compare_lm_test(
            self.res2_restricted, use_lr=True
        )
        LMstat2 = LMstat_OLS[0]
        assert_almost_equal(LMstat, LMstat2, DECIMAL_7)

    def test_LM_nonnested(self):
        assert_raises(
            ValueError, self.res2_restricted.compare_lm_test, self.res2_full
        )


class TestOLS_GLS_WLS_equivalence(object):
    @classmethod
    def setup_class(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        y = data.endog
        x = data.exog
        n = y.shape[0]
        w = np.ones(n)
        cls.results = []
        cls.results.append(OLS(y, x).fit())
        cls.results.append(WLS(y, x, w).fit())
        # scaling weights does not change main results (except scale)
        cls.results.append(GLS(y, x, 100 * w).fit())
        cls.results.append(GLS(y, x, np.diag(0.1 * w)).fit())

    def test_ll(self):
        llf = np.array([r.llf for r in self.results])
        llf_1 = np.ones_like(llf) * self.results[0].llf
        assert_almost_equal(llf, llf_1, DECIMAL_7)

        ic = np.array([r.aic for r in self.results])
        ic_1 = np.ones_like(ic) * self.results[0].aic
        assert_almost_equal(ic, ic_1, DECIMAL_7)

        ic = np.array([r.bic for r in self.results])
        ic_1 = np.ones_like(ic) * self.results[0].bic
        assert_almost_equal(ic, ic_1, DECIMAL_7)

    def test_params(self):
        params = np.array([r.params for r in self.results])
        params_1 = np.array([self.results[0].params] * len(self.results))
        assert_allclose(params, params_1)

    def test_ss(self):
        bse = np.array([r.bse for r in self.results])
        bse_1 = np.array([self.results[0].bse] * len(self.results))
        assert_allclose(bse, bse_1)

    def test_rsquared(self):
        rsquared = np.array([r.rsquared for r in self.results])
        rsquared_1 = np.array([self.results[0].rsquared] * len(self.results))
        assert_almost_equal(rsquared, rsquared_1, DECIMAL_7)


class TestGLS_WLS_equivalence(TestOLS_GLS_WLS_equivalence):
    # reuse test methods

    @classmethod
    def setup_class(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        y = data.endog
        x = data.exog
        n = y.shape[0]
        np.random.seed(5)
        w = np.random.uniform(0.5, 1, n)
        w_inv = 1.0 / w
        cls.results = []
        cls.results.append(WLS(y, x, w).fit())
        # scaling weights does not change main results (except scale)
        cls.results.append(WLS(y, x, 0.01 * w).fit())
        cls.results.append(GLS(y, x, 100 * w_inv).fit())
        cls.results.append(GLS(y, x, np.diag(0.1 * w_inv)).fit())


class TestNonFit(object):
    @classmethod
    def setup_class(cls):
        data = longley.load()
        data.exog = add_constant(data.exog, prepend=False)
        cls.endog = data.endog
        cls.exog = data.exog
        cls.ols_model = OLS(data.endog, data.exog)

    def test_df_resid(self):
        # longley: 16 observations less 7 regressors (including the constant)
        df_resid = self.endog.shape[0] - self.exog.shape[1]
        assert_equal(df_resid, 9)
        assert_equal(self.ols_model.df_resid, df_resid)


class TestWLS_CornerCases(object):
    @classmethod
    def setup_class(cls):
        cls.exog = np.ones((1,))
        cls.endog = np.ones((1,))
        weights = 1
        cls.wls_res = WLS(cls.endog, cls.exog, weights=weights).fit()

    def test_wrong_size_weights(self):
        weights = np.ones((10, 10))
        assert_raises(ValueError, WLS, self.endog, self.exog, weights=weights)


class TestWLSExogWeights(CheckRegressionResults):
    # Test WLS with Greene's credit card data
    # reg avgexp age income incomesq ownrent [aw=1/incomesq]
    @classmethod
    def setup_class(cls):
        from statsmodels.datasets.ccard import load

        from .results.results_regression import CCardWLS

        dta = load()
        endog = np.asarray(dta.endog)
        exog = np.asarray(dta.exog)
        exog = add_constant(exog, prepend=False)
        nobs = 72.0

        weights = 1 / exog[:, 2]
        # for comparison with stata analytic weights
        scaled_weights = (weights * nobs) / weights.sum()
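        # (Stata analytic weights are rescaled so that they sum to the
        # number of observations, hence the scaling above.)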

        cls.res1 = WLS(endog, exog, weights=scaled_weights).fit()
        cls.res2 = CCardWLS()
        cls.res2.wresid = scaled_weights ** 0.5 * cls.res2.resid

        # correction because we use a different definition of loglike/llf
        corr_ic = 2 * (cls.res1.llf - cls.res2.llf)
        cls.res2.aic -= corr_ic
        cls.res2.bic -= corr_ic
        cls.res2.llf += 0.5 * np.sum(np.log(cls.res1.model.weights))


def test_wls_example():
    # example from the docstring; there was a note about a bug that
    # should be fixed now
    Y = [1, 3, 4, 5, 2, 3, 4]
    x = lrange(1, 8)
    x = add_constant(x, prepend=False)
    wls_model = WLS(Y, x, weights=lrange(1, 8)).fit()
    # taken from R lm.summary
    assert_almost_equal(wls_model.fvalue, 0.127337843215, 6)
    assert_almost_equal(wls_model.scale, 2.44608530786 ** 2, 6)


def test_wls_tss():
    y = np.array([22, 22, 22, 23, 23, 23])
    x = [[1, 0], [1, 0], [1, 1], [0, 1], [0, 1], [0, 1]]

    ols_mod = OLS(y, add_constant(x, prepend=False)).fit()

    yw = np.array([22, 22, 23.0])
    Xw = [[1, 0], [1, 1], [0, 1]]
    w = np.array([2, 1, 3.0])
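    # The integer weights replicate rows: (yw, Xw) with w = (2, 1, 3) is
    # the grouped version of the six observations (y, x) above, so the
    # centered total sums of squares should agree.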

    wls_mod = WLS(yw, add_constant(Xw, prepend=False), weights=w).fit()
    assert_equal(ols_mod.centered_tss, wls_mod.centered_tss)


class TestWLSScalarVsArray(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        from statsmodels.datasets.longley import load

        dta = load()
        endog = np.asarray(dta.endog)
        exog = np.asarray(dta.exog)
        exog = add_constant(exog, prepend=True)
        wls_scalar = WLS(endog, exog, weights=1.0 / 3).fit()
        weights = [1 / 3.0] * len(endog)
        wls_array = WLS(endog, exog, weights=weights).fit()
        cls.res1 = wls_scalar
        cls.res2 = wls_array


class TestWLS_GLS(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        from statsmodels.datasets.ccard import load

        data = load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        sigma = exog[:, 2]
        cls.res1 = WLS(endog, exog, weights=1 / sigma).fit()
        cls.res2 = GLS(endog, exog, sigma=sigma).fit()

    def check_confidenceintervals(self, conf1, conf2):  # FIXME: never called
        assert_almost_equal(conf1, conf2(), DECIMAL_4)


def test_wls_missing():
    from statsmodels.datasets.ccard import load

    data = load()
    endog = data.endog
    endog[[10, 25]] = np.nan
    mod = WLS(
        data.endog, data.exog, weights=1 / data.exog.iloc[:, 2], missing="drop"
    )
    assert_equal(mod.endog.shape[0], 70)
    assert_equal(mod.exog.shape[0], 70)
    assert_equal(mod.weights.shape[0], 70)


class TestWLS_OLS(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        data = longley.load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        exog = add_constant(exog, prepend=False)
        cls.res1 = OLS(endog, exog).fit()
        cls.res2 = WLS(endog, exog).fit()

    def check_confidenceintervals(self, conf1, conf2):  # FIXME: never called
        assert_almost_equal(conf1, conf2(), DECIMAL_4)


class TestGLS_OLS(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        data = longley.load()
        endog = np.asarray(data.endog)
        exog = np.asarray(data.exog)
        exog = add_constant(exog, prepend=False)
        cls.res1 = GLS(endog, exog).fit()
        cls.res2 = OLS(endog, exog).fit()

    def check_confidenceintervals(self, conf1, conf2):  # FIXME: never called
        assert_almost_equal(conf1, conf2(), DECIMAL_4)


# FIXME: do not leave this commented-out sitting here
# TODO: test AR
# why the two-stage in AR?
# class TestAR(object):
#     from statsmodels.datasets.sunspots import load
#     data = load()
#     model = AR(data.endog, rho=4).fit()
#     R_res = RModel(data.endog, aic="FALSE", order_max=4)#

#     def test_params(self):
#         assert_almost_equal(self.model.rho,
#         pass

#     def test_order(self):
# In R this can be defined or chosen by minimizing the AIC if aic=True
#        pass


class TestYuleWalker(object):
    @classmethod
    def setup_class(cls):
        from statsmodels.datasets.sunspots import load

        data = load()
        cls.rho, cls.sigma = yule_walker(data.endog, order=4, method="mle")
        cls.R_params = [
            1.2831003105694765,
            -0.45240924374091945,
            -0.20770298557575195,
            0.047943648089542337,
        ]

    def test_params(self):
        assert_almost_equal(self.rho, self.R_params, DECIMAL_4)


class TestDataDimensions(CheckRegressionResults):
    @classmethod
    def setup_class(cls):
        np.random.seed(54321)
        cls.endog_n_ = np.random.uniform(0, 20, size=30)
        cls.endog_n_one = cls.endog_n_[:, None]
        cls.exog_n_ = np.random.uniform(0, 20, size=30)
        cls.exog_n_one = cls.exog_n_[:, None]
        cls.degen_exog = cls.exog_n_one[:-1]
        cls.mod1 = OLS(cls.endog_n_one, cls.exog_n_one)
        cls.mod1.df_model += 1
        cls.res1 = cls.mod1.fit()
        # Note that these are created for every subclass,
        # which adds a little extra overhead.
        cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_one)
        cls.mod2.df_model += 1
        cls.res2 = cls.mod2.fit()

    def check_confidenceintervals(self, conf1, conf2):  # FIXME: never called
        assert_almost_equal(conf1, conf2(), DECIMAL_4)


class TestGLS_large_data(TestDataDimensions):
    @classmethod
    def setup_class(cls):
        super(TestGLS_large_data, cls).setup_class()
        nobs = 1000
        y = np.random.randn(nobs, 1)
        x = np.random.randn(nobs, 20)
        sigma = np.ones_like(y)
        cls.gls_res = GLS(y, x, sigma=sigma).fit()
        cls.gls_res_scalar = GLS(y, x, sigma=1).fit()
        cls.gls_res_none = GLS(y, x).fit()
        cls.ols_res = OLS(y, x).fit()

    def test_large_equal_params(self):
        assert_almost_equal(
            self.ols_res.params, self.gls_res.params, DECIMAL_7
        )

    def test_large_equal_loglike(self):
        assert_almost_equal(self.ols_res.llf, self.gls_res.llf, DECIMAL_7)

    def test_large_equal_params_none(self):
        assert_almost_equal(
            self.gls_res.params, self.gls_res_none.params, DECIMAL_7
        )


class TestNxNx(TestDataDimensions):
    @classmethod
    def setup_class(cls):
        super(TestNxNx, cls).setup_class()
        cls.mod2 = OLS(cls.endog_n_, cls.exog_n_)
        cls.mod2.df_model += 1
        cls.res2 = cls.mod2.fit()


class TestNxOneNx(TestDataDimensions):
    @classmethod
    def setup_class(cls):
        super(TestNxOneNx, cls).setup_class()
        cls.mod2 = OLS(cls.endog_n_one, cls.exog_n_)
        cls.mod2.df_model += 1
        cls.res2 = cls.mod2.fit()


class TestNxNxOne(TestDataDimensions):
    @classmethod
    def setup_class(cls):
        super(TestNxNxOne, cls).setup_class()
        cls.mod2 = OLS(cls.endog_n_, cls.exog_n_one)
        cls.mod2.df_model += 1
        cls.res2 = cls.mod2.fit()


def test_bad_size():
    np.random.seed(54321)
    data = np.random.uniform(0, 20, 31)
    assert_raises(ValueError, OLS, data, data[1:])


def test_const_indicator():
    rs = np.random.RandomState(12345)
    x = rs.randint(0, 3, size=30)
    x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False)
    y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30)
    resc = OLS(y, add_constant(x.iloc[:, 1:], prepend=True)).fit()
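    # the three category dummies sum to one and hence span a constant;
    # hasconst=True declares that implicit constant to the model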
    res = OLS(y, x, hasconst=True).fit()
    assert_almost_equal(resc.rsquared, res.rsquared, 12)
    assert res.model.data.k_constant == 1
    assert resc.model.data.k_constant == 1


def test_fvalue_const_only():
    rs = np.random.RandomState(12345)
    x = rs.randint(0, 3, size=30)
    x = pd.get_dummies(pd.Series(x, dtype="category"), drop_first=False)
    x.iloc[:, 0] = 1
    y = np.dot(x, [1.0, 2.0, 3.0]) + rs.normal(size=30)
    res = OLS(y, x, hasconst=True).fit(cov_type="HC1")
    assert not np.isnan(res.fvalue)
    assert isinstance(res.fvalue, float)
    assert isinstance(res.f_pvalue, float)


def test_conf_int_single_regressor():
    # GH#706 single-regressor model (i.e. no intercept) with 1D exog
    # should get passed to DataFrame for conf_int
    y = pd.Series(np.random.randn(10))
    x = pd.Series(np.ones(10))
    res = OLS(y, x).fit()
    conf_int = res.conf_int()
    np.testing.assert_equal(conf_int.shape, (1, 2))
    np.testing.assert_(isinstance(conf_int, pd.DataFrame))


def test_summary_as_latex():
    # GH#734
    import re

    dta = longley.load_pandas()
    x = dta.exog
    x["constant"] = 1
    y = dta.endog
    res = OLS(y, x).fit()
    with pytest.warns(UserWarning):
        table = res.summary().as_latex()
    # replace the date and time
    table = re.sub(
        "(?<=\n\\\\textbf\\{Date:\\}             &).+?&",
        " Sun, 07 Apr 2013 &",
        table,
    )
    table = re.sub(
        "(?<=\n\\\\textbf\\{Time:\\}             &).+?&",
        "     13:46:07     &",
        table,
    )

    expected = """\\begin{center}
\\begin{tabular}{lclc}
\\toprule
\\textbf{Dep. Variable:}    &      TOTEMP      & \\textbf{  R-squared:         } &     0.995   \\\\
\\textbf{Model:}            &       OLS        & \\textbf{  Adj. R-squared:    } &     0.992   \\\\
\\textbf{Method:}           &  Least Squares   & \\textbf{  F-statistic:       } &     330.3   \\\\
\\textbf{Date:}             & Sun, 07 Apr 2013 & \\textbf{  Prob (F-statistic):} &  4.98e-10   \\\\
\\textbf{Time:}             &     13:46:07     & \\textbf{  Log-Likelihood:    } &   -109.62   \\\\
\\textbf{No. Observations:} &          16      & \\textbf{  AIC:               } &     233.2   \\\\
\\textbf{Df Residuals:}     &           9      & \\textbf{  BIC:               } &     238.6   \\\\
\\textbf{Df Model:}         &           6      & \\textbf{                     } &             \\\\
\\textbf{Covariance Type:}  &    nonrobust     & \\textbf{                     } &             \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lcccccc}
                  & \\textbf{coef} & \\textbf{std err} & \\textbf{t} & \\textbf{P$> |$t$|$} & \\textbf{[0.025} & \\textbf{0.975]}  \\\\
\\midrule
\\textbf{GNPDEFL}  &      15.0619  &       84.915     &     0.177  &         0.863        &     -177.029    &      207.153     \\\\
\\textbf{GNP}      &      -0.0358  &        0.033     &    -1.070  &         0.313        &       -0.112    &        0.040     \\\\
\\textbf{UNEMP}    &      -2.0202  &        0.488     &    -4.136  &         0.003        &       -3.125    &       -0.915     \\\\
\\textbf{ARMED}    &      -1.0332  &        0.214     &    -4.822  &         0.001        &       -1.518    &       -0.549     \\\\
\\textbf{POP}      &      -0.0511  &        0.226     &    -0.226  &         0.826        &       -0.563    &        0.460     \\\\
\\textbf{YEAR}     &    1829.1515  &      455.478     &     4.016  &         0.003        &      798.788    &     2859.515     \\\\
\\textbf{constant} &   -3.482e+06  &      8.9e+05     &    -3.911  &         0.004        &     -5.5e+06    &    -1.47e+06     \\\\
\\bottomrule
\\end{tabular}
\\begin{tabular}{lclc}
\\textbf{Omnibus:}       &  0.749 & \\textbf{  Durbin-Watson:     } &    2.559  \\\\
\\textbf{Prob(Omnibus):} &  0.688 & \\textbf{  Jarque-Bera (JB):  } &    0.684  \\\\
\\textbf{Skew:}          &  0.420 & \\textbf{  Prob(JB):          } &    0.710  \\\\
\\textbf{Kurtosis:}      &  2.434 & \\textbf{  Cond. No.          } & 4.86e+09  \\\\
\\bottomrule
\\end{tabular}
%\\caption{OLS Regression Results}
\\end{center}

Notes: \\newline
 [1] Standard Errors assume that the covariance matrix of the errors is correctly specified. \\newline
 [2] The condition number is large, 4.86e+09. This might indicate that there are \\newline
 strong multicollinearity or other numerical problems."""
    assert_equal(table, expected)


class TestRegularizedFit(object):

    # Make sure there are no problems when no variables are selected.
    def test_empty_model(self):

        np.random.seed(742)
        n = 100
        endog = np.random.normal(size=n)
        exog = np.random.normal(size=(n, 3))

        for cls in OLS, WLS, GLS:
            model = cls(endog, exog)
            result = model.fit_regularized(alpha=1000)
            assert_equal(result.params, 0.0)

    def test_regularized(self):

        import os

        from .results import glmnet_r_results

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        data = np.loadtxt(
            os.path.join(cur_dir, "results", "lasso_data.csv"), delimiter=","
        )

        tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]

        for test in tests:

            vec = getattr(glmnet_r_results, test)

            n = vec[0]
            p = vec[1]
            L1_wt = float(vec[2])
            lam = float(vec[3])
            params = vec[4:].astype(np.float64)

            endog = data[0 : int(n), 0]
            exog = data[0 : int(n), 1 : (int(p) + 1)]

            endog = endog - endog.mean()
            endog /= endog.std(ddof=1)
            exog = exog - exog.mean(0)
            exog /= exog.std(0, ddof=1)

            for cls in OLS, WLS, GLS:
                mod = cls(endog, exog)
                rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
                assert_almost_equal(rslt.params, params, decimal=3)

                # Smoke test for profile likelihood
                mod.fit_regularized(L1_wt=L1_wt, alpha=lam, profile_scale=True)

    def test_regularized_weights(self):

        np.random.seed(1432)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
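        # A weight of 2 on the first block is equivalent to including the
        # block twice, as in (endog_a, exog_a) above.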
        sigma = np.diag(1 / wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha in 0, 1:
                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)

    def test_regularized_weights_list(self):

        np.random.seed(132)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
        sigma = np.diag(1 / wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha_element in 0, 1:
                alpha = [
                    alpha_element,
                ] * 3

                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)


def test_formula_missing_cat():
    # gh-805

    from patsy import PatsyError

    import statsmodels.api as sm
    from statsmodels.formula.api import ols

    dta = sm.datasets.grunfeld.load_pandas().data
    dta.loc[dta.index[0], "firm"] = np.nan

    mod = ols(
        formula="value ~ invest + capital + firm + year", data=dta.dropna()
    )
    res = mod.fit()

    mod2 = ols(formula="value ~ invest + capital + firm + year", data=dta)
    res2 = mod2.fit()

    assert_almost_equal(res.params.values, res2.params.values)

    assert_raises(
        PatsyError,
        ols,
        "value ~ invest + capital + firm + year",
        data=dta,
        missing="raise",
    )


def test_missing_formula_predict():
    # see 2171
    nsample = 30

    data = np.linspace(0, 10, nsample)
    null = np.array([np.nan])
    data = pd.DataFrame({"x": np.concatenate((data, null))})
    beta = np.array([1, 0.1])
    e = np.random.normal(size=nsample + 1)
    data["y"] = beta[0] + beta[1] * data["x"] + e
    model = OLS.from_formula("y ~ x", data=data)
    fit = model.fit()
    fit.predict(exog=data[:-1])


def test_fvalue_implicit_constant():
    # if the constant is implicit, fvalue should be nan; see #2444
    nobs = 100
    np.random.seed(2)
    x = np.random.randn(nobs, 1)
    x = ((x > 0) == [True, False]).astype(int)
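    # the two complementary indicator columns sum to one, so the model
    # contains an implicit constant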
    y = x.sum(1) + np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type="HC1")
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type="HC1")
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()


def test_fvalue_only_constant():
    # if there is only a constant in the model, fvalue should be nan; see #3642
    nobs = 20
    np.random.seed(2)
    x = np.ones(nobs)
    y = np.random.randn(nobs)

    from statsmodels.regression.linear_model import OLS, WLS

    res = OLS(y, x).fit(cov_type="hac", cov_kwds={"maxlags": 3})
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()

    res = WLS(y, x).fit(cov_type="HC1")
    assert_(np.isnan(res.fvalue))
    assert_(np.isnan(res.f_pvalue))
    res.summary()


def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    v = np.ones(p)
    v[0] = 0

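    # With L1_wt=0 the elastic net penalty reduces to pure ridge, so
    # fit_regularized should agree with the internal _fit_ridge path
    # (and with a numerically negligible L1 weight).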
    for a in (0, 1, 10):
        for alpha in (a, a * np.ones(p), a * v):
            model1 = OLS(yvec, xmat)
            result1 = model1._fit_ridge(alpha=alpha)
            model2 = OLS(yvec, xmat)
            result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
            assert_allclose(result1.params, result2.params)
            model3 = OLS(yvec, xmat)
            result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
            assert_allclose(result1.params, result3.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)


def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    # covariates 0 and 2 matter
    yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2.0, L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat[:, [0, 2]])
    result2 = model2.fit()
    ii = [0, 2]
    assert_allclose(result1.params[ii], result2.params)
    assert_allclose(result1.bse[ii], result2.bse)


def test_regularized_predict():
    # this also compares WLS with GLS
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    wgt = np.random.uniform(1, 2, n)
    model_wls = WLS(yvec, xmat, weights=wgt)
    # TODO: params is not the same in GLS if sigma=1 / wgt, i.e 1-dim, #7755
    model_gls1 = GLS(yvec, xmat, sigma=np.diag(1 / wgt))
    model_gls2 = GLS(yvec, xmat, sigma=1 / wgt)
    res = []
    for model1 in [model_wls, model_gls1, model_gls2]:
        result1 = model1.fit_regularized(alpha=20.0, L1_wt=0.5, refit=True)
        res.append(result1)
        params = result1.params
        fittedvalues = np.dot(xmat, params)
        pr = model1.predict(result1.params)
        assert_allclose(fittedvalues, pr)
        assert_allclose(result1.fittedvalues, pr)

        pr = result1.predict()
        assert_allclose(fittedvalues, pr)

    assert_allclose(res[0].model.wendog, res[1].model.wendog, rtol=1e-10)
    assert_allclose(res[0].model.wexog, res[1].model.wexog, rtol=1e-10)
    assert_allclose(res[0].fittedvalues, res[1].fittedvalues, rtol=1e-10)
    assert_allclose(res[0].params, res[1].params, rtol=1e-10)

    assert_allclose(res[0].model.wendog, res[2].model.wendog, rtol=1e-10)
    assert_allclose(res[0].model.wexog, res[2].model.wexog, rtol=1e-10)
    assert_allclose(res[0].fittedvalues, res[2].fittedvalues, rtol=1e-10)
    assert_allclose(res[0].params, res[2].params, rtol=1e-10)


def test_regularized_options():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec - 1, xmat)
    result1 = model1.fit_regularized(alpha=1.0, L1_wt=0.5)
    model2 = OLS(yvec, xmat, offset=1)
    result2 = model2.fit_regularized(
        alpha=1.0, L1_wt=0.5, start_params=np.zeros(5)
    )
    assert_allclose(result1.params, result2.params)


def test_burg():
    rnd = np.random.RandomState(12345)
    e = rnd.randn(10001)
    y = e[1:] + 0.5 * e[:-1]
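    # y is an MA(1) process; burg fits AR approximations of increasing order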
    # R, ar.burg
    expected = [
        [0.3909931],
        [0.4602607, -0.1771582],
        [0.47473245, -0.21475602, 0.08168813],
        [0.4787017, -0.2251910, 0.1047554, -0.0485900],
        [0.47975462, -0.22746106, 0.10963527, -0.05896347, 0.02167001],
    ]

    for i in range(1, 6):
        ar, _ = burg(y, i)
        assert_allclose(ar, expected[i - 1], atol=1e-6)
        as_nodemean, _ = burg(1 + y, i, False)
        assert np.all(ar != as_nodemean)


def test_burg_errors():
    with pytest.raises(ValueError):
        burg(np.ones((100, 2)))
    with pytest.raises(ValueError):
        burg(np.random.randn(100), 0)
    with pytest.raises(ValueError):
        burg(np.random.randn(100), "apple")


@pytest.mark.skipif(not has_cvxopt, reason="sqrt_lasso requires cvxopt")
def test_sqrt_lasso():

    np.random.seed(234923)

    # Based on the example in the Belloni paper
    n = 100
    p = 500
    ii = np.arange(p)
    cx = 0.5 ** np.abs(np.subtract.outer(ii, ii))
    cxr = np.linalg.cholesky(cx)

    x = np.dot(np.random.normal(size=(n, p)), cxr.T)
    b = np.zeros(p)
    b[0:5] = [1, 1, 1, 1, 1]

    from scipy.stats.distributions import norm

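    # Penalty level suggested for the square-root lasso (Belloni et al.),
    #     alpha = 1.1 * sqrt(n) * Phi^{-1}(1 - 0.05 / (2 * p)),
    # which does not depend on the unknown noise standard deviation.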
    alpha = 1.1 * np.sqrt(n) * norm.ppf(1 - 0.05 / (2 * p))

    # Use very low noise level for a unit test
    y = np.dot(x, b) + 0.25 * np.random.normal(size=n)

    # At low noise levels, the sqrt lasso should be around a
    # factor of 3 from the oracle without refit, and should
    # almost equal the oracle with refit.
    expected_oracle = {False: 3, True: 1}

    # Used for regression testing
    expected_params = {
        False: np.r_[
            0.87397122, 0.96051874, 0.9905915, 0.93868953, 0.90771773
        ],
        True: np.r_[0.95114241, 1.0302987, 1.01723074, 0.97587343, 0.99846403],
    }

    for refit in False, True:

        rslt = OLS(y, x).fit_regularized(
            method="sqrt_lasso", alpha=alpha, refit=refit
        )
        err = rslt.params - b
        numer = np.sqrt(np.dot(err, np.dot(cx, err)))

        oracle = OLS(y, x[:, 0:5]).fit()
        oracle_err = np.zeros(p)
        oracle_err[0:5] = oracle.params - b[0:5]
        denom = np.sqrt(np.dot(oracle_err, np.dot(cx, oracle_err)))

        # Check performance relative to the oracle; the ratio should be
        # around expected_oracle[refit]
        assert_allclose(
            numer / denom, expected_oracle[refit], rtol=0.5, atol=0.1
        )

        # Regression test the parameters
        assert_allclose(
            rslt.params[0:5], expected_params[refit], rtol=1e-5, atol=1e-5
        )


def test_bool_regressor(reset_randomstate):
    exog = np.random.randint(0, 2, size=(100, 2)).astype(bool)
    endog = np.random.standard_normal(100)
    bool_res = OLS(endog, exog).fit()
    res = OLS(endog, exog.astype(np.double)).fit()
    assert_allclose(bool_res.params, res.params)


def test_ols_constant(reset_randomstate):
    y = np.random.standard_normal((200))
    x = np.ones((200, 1))
    res = OLS(y, x).fit()
    # accessing fvalue / f_pvalue on a constant-only model should emit no
    # warnings (catch_warnings replaces the pytest.warns(None) idiom
    # removed in pytest 7)
    with warnings.catch_warnings(record=True) as recording:
        warnings.simplefilter("always")
        assert np.isnan(res.fvalue)
        assert np.isnan(res.f_pvalue)
    assert len(recording) == 0


def test_summary_no_constant():
    rs = np.random.RandomState(0)
    x = rs.standard_normal((100, 2))
    y = rs.standard_normal(100)
    summary = OLS(y, x).fit().summary()
    assert "R² is computed " in summary.as_text()
