1# -*- coding: utf-8 -*-
2"""Tests for finding a positive semi-definite correlation or covariance matrix
3
4Created on Mon May 27 12:07:02 2013
5
6Author: Josef Perktold
7"""
8import warnings
9
10import numpy as np
11from numpy.testing import assert_almost_equal, assert_allclose
12import scipy.sparse as sparse
13import pytest
14
15from statsmodels.stats.correlation_tools import (
16    corr_nearest, corr_clipped, cov_nearest,
17    _project_correlation_factors, corr_nearest_factor, _spg_optim,
18    corr_thresholded, cov_nearest_factor_homog, FactoredPSDMatrix)
19from statsmodels.tools.testing import Holder
20
21
22def norm_f(x, y):
23    '''Frobenious norm (squared sum) of difference between two arrays
24    '''
25    d = ((x - y)**2).sum()
26    return np.sqrt(d)
27
28
29# R library Matrix results
30cov1_r = Holder()
31#> nc  <- nearPD(pr, conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=TRUE)
32#> cat_items(nc, prefix="cov1_r.")
33cov1_r.mat = '''<S4 object of class structure("dpoMatrix", package = "Matrix")>'''
34cov1_r.eigenvalues = np.array([
35     4.197315628646795, 0.7540460243978023, 0.5077608149667492,
36     0.3801267599652769, 0.1607508970775889, 4.197315628646795e-08
37    ])
38cov1_r.corr = '''TRUE'''
39cov1_r.normF = 0.0743805226512533
40cov1_r.iterations = 11
41cov1_r.rel_tol = 8.288594638441735e-08
42cov1_r.converged = '''TRUE'''
43#> mkarray2(as.matrix(nc$mat), name="cov1_r.mat")
44cov1_r.mat = np.array([
45     1, 0.487968018215892, 0.642651880010906, 0.4906386709070835,
46     0.6440990530811909, 0.8087111845493985, 0.487968018215892, 1,
47     0.5141147294352735, 0.2506688108312097, 0.672351311297074,
48     0.725832055882795, 0.642651880010906, 0.5141147294352735, 1,
49     0.596827778712154, 0.5821917790519067, 0.7449631633814129,
50     0.4906386709070835, 0.2506688108312097, 0.596827778712154, 1,
51     0.729882058012399, 0.772150225146826, 0.6440990530811909,
52     0.672351311297074, 0.5821917790519067, 0.729882058012399, 1,
53     0.813191720191944, 0.8087111845493985, 0.725832055882795,
54     0.7449631633814129, 0.772150225146826, 0.813191720191944, 1
55    ]).reshape(6,6, order='F')
56
57
58cov_r = Holder()
59#nc  <- nearPD(pr+0.01*diag(6), conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=FALSE)
60#> cat_items(nc, prefix="cov_r.")
61#cov_r.mat = '''<S4 object of class structure("dpoMatrix", package = "Matrix")>'''
62cov_r.eigenvalues = np.array([
63     4.209897516692652, 0.7668341923072066, 0.518956980021938,
64     0.390838551407132, 0.1734728460460068, 4.209897516692652e-08
65    ])
66cov_r.corr = '''FALSE'''
67cov_r.normF = 0.0623948693159157
68cov_r.iterations = 11
69cov_r.rel_tol = 5.83987595937896e-08
70cov_r.converged = '''TRUE'''
71
72#> mkarray2(as.matrix(nc$mat), name="cov_r.mat")
73cov_r.mat = np.array([
74     1.01, 0.486207476951913, 0.6428524769306785, 0.4886092840296514,
75     0.645175579158233, 0.811533860074678, 0.486207476951913, 1.01,
76     0.514394615153752, 0.2478398278204047, 0.673852495852274,
77     0.7297661648968664, 0.6428524769306785, 0.514394615153752, 1.01,
78     0.5971503271420517, 0.582018469844712, 0.7445177382760834,
79     0.4886092840296514, 0.2478398278204047, 0.5971503271420517, 1.01,
80     0.73161232298669, 0.7766852947049376, 0.645175579158233,
81     0.673852495852274, 0.582018469844712, 0.73161232298669, 1.01,
82     0.8107916469252828, 0.811533860074678, 0.7297661648968664,
83     0.7445177382760834, 0.7766852947049376, 0.8107916469252828, 1.01
84    ]).reshape(6,6, order='F')
85
86
87def test_corr_psd():
88    # test positive definite matrix is unchanged
89    x = np.array([[1, -0.2, -0.9], [-0.2, 1, -0.2], [-0.9, -0.2, 1]])
90
91    y = corr_nearest(x, n_fact=100)
92    #print np.max(np.abs(x - y))
93    assert_almost_equal(x, y, decimal=14)
94
95    y = corr_clipped(x)
96    assert_almost_equal(x, y, decimal=14)
97
98    y = cov_nearest(x, n_fact=100)
99    assert_almost_equal(x, y, decimal=14)
100
101    x2 = x + 0.001 * np.eye(3)
102    y = cov_nearest(x2, n_fact=100)
103    assert_almost_equal(x2, y, decimal=14)
104
105
106class CheckCorrPSDMixin(object):
107
108    def test_nearest(self):
109        x = self.x
110        res_r = self.res
111        y = corr_nearest(x, threshold=1e-7, n_fact=100)
112        #print np.max(np.abs(x - y))
113        assert_almost_equal(y, res_r.mat, decimal=3)
114        d = norm_f(x, y)
115        assert_allclose(d, res_r.normF, rtol=0.0015)
116        evals = np.linalg.eigvalsh(y)
117        #print 'evals', evals / res_r.eigenvalues[::-1] - 1
118        assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.003, atol=1e-7)
119        #print evals[0] / 1e-7 - 1
120        assert_allclose(evals[0], 1e-7, rtol=1e-6)
121
122    def test_clipped(self):
123        x = self.x
124        res_r = self.res
125        y = corr_clipped(x, threshold=1e-7)
126        #print np.max(np.abs(x - y)), np.max(np.abs((x - y) / y))
127        assert_almost_equal(y, res_r.mat, decimal=1)
128        d = norm_f(x, y)
129        assert_allclose(d, res_r.normF, rtol=0.15)
130
131        evals = np.linalg.eigvalsh(y)
132        assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.1, atol=1e-7)
133        assert_allclose(evals[0], 1e-7, rtol=0.02)
134
135    def test_cov_nearest(self):
136        x = self.x
137        res_r = self.res
138        with warnings.catch_warnings():
139            warnings.simplefilter("ignore")
140            y = cov_nearest(x, method='nearest', threshold=1e-7)
141        #print np.max(np.abs(x - y))
142        assert_almost_equal(y, res_r.mat, decimal=2)
143        d = norm_f(x, y)
144        assert_allclose(d, res_r.normF, rtol=0.0015)
145
146
147class TestCovPSD(object):
148
149    @classmethod
150    def setup_class(cls):
151        x = np.array([ 1,     0.477, 0.644, 0.478, 0.651, 0.826,
152                       0.477, 1,     0.516, 0.233, 0.682, 0.75,
153                       0.644, 0.516, 1,     0.599, 0.581, 0.742,
154                       0.478, 0.233, 0.599, 1,     0.741, 0.8,
155                       0.651, 0.682, 0.581, 0.741, 1,     0.798,
156                       0.826, 0.75,  0.742, 0.8,   0.798, 1]).reshape(6,6)
157        cls.x = x + 0.01 * np.eye(6)
158        cls.res = cov_r
159
160    def test_cov_nearest(self):
161        x = self.x
162        res_r = self.res
163        y = cov_nearest(x, method='nearest')
164        #print np.max(np.abs(x - y))
165        assert_almost_equal(y, res_r.mat, decimal=3)
166        d = norm_f(x, y)
167        assert_allclose(d, res_r.normF, rtol=0.001)
168
169        y = cov_nearest(x, method='clipped')
170        #print np.max(np.abs(x - y))
171        assert_almost_equal(y, res_r.mat, decimal=2)
172        d = norm_f(x, y)
173        assert_allclose(d, res_r.normF, rtol=0.15)
174
175
176class TestCorrPSD1(CheckCorrPSDMixin):
177
178    @classmethod
179    def setup_class(cls):
180        x = np.array([ 1,     0.477, 0.644, 0.478, 0.651, 0.826,
181                       0.477, 1,     0.516, 0.233, 0.682, 0.75,
182                       0.644, 0.516, 1,     0.599, 0.581, 0.742,
183                       0.478, 0.233, 0.599, 1,     0.741, 0.8,
184                       0.651, 0.682, 0.581, 0.741, 1,     0.798,
185                       0.826, 0.75,  0.742, 0.8,   0.798, 1]).reshape(6,6)
186        cls.x = x
187        cls.res = cov1_r
188
189
190@pytest.mark.parametrize('threshold', [0, 1e-15, 1e-10, 1e-6])
191def test_corrpsd_threshold(threshold):
192    x = np.array([[1, -0.9, -0.9], [-0.9, 1, -0.9], [-0.9, -0.9, 1]])
193
194    y = corr_nearest(x, n_fact=100, threshold=threshold)
195    evals = np.linalg.eigvalsh(y)
196    assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15)
197
198    y = corr_clipped(x, threshold=threshold)
199    evals = np.linalg.eigvalsh(y)
200    assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15)
201
202    y = cov_nearest(x, method='nearest', n_fact=100, threshold=threshold)
203    evals = np.linalg.eigvalsh(y)
204    assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15)
205
206    y = cov_nearest(x, n_fact=100, threshold=threshold)
207    evals = np.linalg.eigvalsh(y)
208    assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15)
209
210
211class Test_Factor(object):
212
213    def test_corr_nearest_factor_arrpack(self):
214
215        # regression results for svds call
216        u2 = np.array([[
217         6.39407581e-19,   9.15225947e-03,   1.82631698e-02,
218         2.72917181e-02,   3.61975557e-02,   4.49413101e-02,
219         5.34848732e-02,   6.17916613e-02,   6.98268388e-02,
220         7.75575058e-02,   8.49528448e-02,   9.19842264e-02,
221         9.86252769e-02,   1.04851906e-01,   1.10642305e-01,
222         1.15976906e-01,   1.20838331e-01,   1.25211306e-01,
223         1.29082570e-01,   1.32440778e-01,   1.35276397e-01,
224         1.37581605e-01,   1.39350201e-01,   1.40577526e-01,
225         1.41260396e-01,   1.41397057e-01,   1.40987160e-01,
226         1.40031756e-01,   1.38533306e-01,   1.36495727e-01,
227         1.33924439e-01,   1.30826443e-01,   1.27210404e-01,
228         1.23086750e-01,   1.18467769e-01,   1.13367717e-01,
229         1.07802909e-01,   1.01791811e-01,   9.53551023e-02,
230         8.85157320e-02,   8.12989329e-02,   7.37322125e-02,
231         6.58453049e-02,   5.76700847e-02,   4.92404406e-02,
232         4.05921079e-02,   3.17624629e-02,   2.27902803e-02,
233         1.37154584e-02,   4.57871801e-03,  -4.57871801e-03,
234        -1.37154584e-02,  -2.27902803e-02,  -3.17624629e-02,
235        -4.05921079e-02,  -4.92404406e-02,  -5.76700847e-02,
236        -6.58453049e-02,  -7.37322125e-02,  -8.12989329e-02,
237        -8.85157320e-02,  -9.53551023e-02,  -1.01791811e-01,
238        -1.07802909e-01,  -1.13367717e-01,  -1.18467769e-01,
239        -1.23086750e-01,  -1.27210404e-01,  -1.30826443e-01,
240        -1.33924439e-01,  -1.36495727e-01,  -1.38533306e-01,
241        -1.40031756e-01,  -1.40987160e-01,  -1.41397057e-01,
242        -1.41260396e-01,  -1.40577526e-01,  -1.39350201e-01,
243        -1.37581605e-01,  -1.35276397e-01,  -1.32440778e-01,
244        -1.29082570e-01,  -1.25211306e-01,  -1.20838331e-01,
245        -1.15976906e-01,  -1.10642305e-01,  -1.04851906e-01,
246        -9.86252769e-02,  -9.19842264e-02,  -8.49528448e-02,
247        -7.75575058e-02,  -6.98268388e-02,  -6.17916613e-02,
248        -5.34848732e-02,  -4.49413101e-02,  -3.61975557e-02,
249        -2.72917181e-02,  -1.82631698e-02,  -9.15225947e-03,
250        -3.51829569e-17]]).T
251        s2 = np.array([ 24.88812183])
252
253        d = 100
254        dm = 1
255
256        # Construct a test matrix with exact factor structure
257        X = np.zeros((d,dm), dtype=np.float64)
258        x = np.linspace(0, 2*np.pi, d)
259        for j in range(dm):
260            X[:,j] = np.sin(x*(j+1))
261        _project_correlation_factors(X)
262        X *= 0.7
263        mat = np.dot(X, X.T)
264        np.fill_diagonal(mat, 1.)
265
266        from scipy.sparse.linalg import svds
267        u, s, vt = svds(mat, dm)
268
269        #difference in sign
270        dsign = np.sign(u[1]) * np.sign(u2[1])
271
272        assert_allclose(u, dsign * u2, rtol=1e-6, atol=1e-14)
273        assert_allclose(s, s2, rtol=1e-6)
274
275    @pytest.mark.parametrize('dm', [1, 2])
276    def test_corr_nearest_factor(self, dm):
277
278        objvals = [np.array([6241.8, 6241.8, 579.4, 264.6, 264.3]),
279                   np.array([2104.9, 2104.9, 710.5, 266.3, 286.1])]
280
281        d = 100
282
283        # Construct a test matrix with exact factor structure
284        X = np.zeros((d, dm), dtype=np.float64)
285        x = np.linspace(0, 2 * np.pi, d)
286        np.random.seed(10)
287        for j in range(dm):
288            X[:, j] = np.sin(x * (j + 1)) + 1e-10 * np.random.randn(d)
289
290        _project_correlation_factors(X)
291        assert np.isfinite(X).all()
292        X *= 0.7
293        mat = np.dot(X, X.T)
294        np.fill_diagonal(mat, 1.)
295
296        # Try to recover the structure
297        rslt = corr_nearest_factor(mat, dm, maxiter=10000)
298        err_msg = 'rank=%d, niter=%d' % (dm, len(rslt.objective_values))
299        assert_allclose(rslt.objective_values[:5], objvals[dm - 1],
300                        rtol=0.5, err_msg=err_msg)
301        assert rslt.Converged
302
303        mat1 = rslt.corr.to_matrix()
304        assert_allclose(mat, mat1, rtol=0.25, atol=1e-3, err_msg=err_msg)
305
306    @pytest.mark.slow
307    @pytest.mark.parametrize('dm', [1, 2])
308    def test_corr_nearest_factor_sparse(self, dm):
309        # Test that result is the same if the input is dense or sparse
310        d = 200
311
312        # Generate a test matrix of factors
313        X = np.zeros((d, dm), dtype=np.float64)
314        x = np.linspace(0, 2 * np.pi, d)
315        rs = np.random.RandomState(10)
316        for j in range(dm):
317            X[:, j] = np.sin(x * (j + 1)) + rs.randn(d)
318
319        # Get the correlation matrix
320        _project_correlation_factors(X)
321        X *= 0.7
322        mat = np.dot(X, X.T)
323        np.fill_diagonal(mat, 1)
324
325        # Threshold it
326        mat.flat[np.abs(mat.flat) < 0.35] = 0.0
327        smat = sparse.csr_matrix(mat)
328
329        dense_rslt = corr_nearest_factor(mat, dm, maxiter=10000)
330        sparse_rslt = corr_nearest_factor(smat, dm, maxiter=10000)
331
332        mat_dense = dense_rslt.corr.to_matrix()
333        mat_sparse = sparse_rslt.corr.to_matrix()
334
335        assert dense_rslt.Converged is sparse_rslt.Converged
336        assert dense_rslt.Converged is True
337
338        assert_allclose(mat_dense, mat_sparse, rtol=.25, atol=1e-3)
339
340    # Test on a quadratic function.
341    def test_spg_optim(self, reset_randomstate):
342
343        dm = 100
344
345        ind = np.arange(dm)
346        indmat = np.abs(ind[:,None] - ind[None,:])
347        M = 0.8**indmat
348
349        def obj(x):
350            return np.dot(x, np.dot(M, x))
351
352        def grad(x):
353            return 2*np.dot(M, x)
354
355        def project(x):
356            return x
357
358        x = np.random.normal(size=dm)
359        rslt = _spg_optim(obj, grad, x, project)
360        xnew = rslt.params
361        assert rslt.Converged is True
362        assert_almost_equal(obj(xnew), 0, decimal=3)
363
364    def test_decorrelate(self, reset_randomstate):
365
366        d = 30
367        dg = np.linspace(1, 2, d)
368        root = np.random.normal(size=(d, 4))
369        fac = FactoredPSDMatrix(dg, root)
370        mat = fac.to_matrix()
371        rmat = np.linalg.cholesky(mat)
372        dcr = fac.decorrelate(rmat)
373        idm = np.dot(dcr, dcr.T)
374        assert_almost_equal(idm, np.eye(d))
375
376        rhs = np.random.normal(size=(d, 5))
377        mat2 = np.dot(rhs.T, np.linalg.solve(mat, rhs))
378        mat3 = fac.decorrelate(rhs)
379        mat3 = np.dot(mat3.T, mat3)
380        assert_almost_equal(mat2, mat3)
381
382    def test_logdet(self, reset_randomstate):
383
384        d = 30
385        dg = np.linspace(1, 2, d)
386        root = np.random.normal(size=(d, 4))
387        fac = FactoredPSDMatrix(dg, root)
388        mat = fac.to_matrix()
389
390        _, ld = np.linalg.slogdet(mat)
391        ld2 = fac.logdet()
392
393        assert_almost_equal(ld, ld2)
394
395    def test_solve(self, reset_randomstate):
396
397        d = 30
398        dg = np.linspace(1, 2, d)
399        root = np.random.normal(size=(d, 2))
400        fac = FactoredPSDMatrix(dg, root)
401        rhs = np.random.normal(size=(d, 5))
402        sr1 = fac.solve(rhs)
403        mat = fac.to_matrix()
404        sr2 = np.linalg.solve(mat, rhs)
405        assert_almost_equal(sr1, sr2)
406
407    @pytest.mark.parametrize('dm', [1, 2])
408    def test_cov_nearest_factor_homog(self, dm):
409
410        d = 100
411
412        # Construct a test matrix with exact factor structure
413        X = np.zeros((d, dm), dtype=np.float64)
414        x = np.linspace(0, 2*np.pi, d)
415        for j in range(dm):
416            X[:, j] = np.sin(x*(j+1))
417        mat = np.dot(X, X.T)
418        np.fill_diagonal(mat, np.diag(mat) + 3.1)
419
420        # Try to recover the structure
421        rslt = cov_nearest_factor_homog(mat, dm)
422        mat1 = rslt.to_matrix()
423
424        assert_allclose(mat, mat1, rtol=0.25, atol=1e-3)
425
426    @pytest.mark.parametrize('dm', [1, 2])
427    def test_cov_nearest_factor_homog_sparse(self, dm):
428        # Check that dense and sparse inputs give the same result
429
430        d = 100
431
432        # Construct a test matrix with exact factor structure
433        X = np.zeros((d, dm), dtype=np.float64)
434        x = np.linspace(0, 2*np.pi, d)
435        for j in range(dm):
436            X[:, j] = np.sin(x*(j+1))
437        mat = np.dot(X, X.T)
438        np.fill_diagonal(mat, np.diag(mat) + 3.1)
439
440        # Fit to dense
441        rslt = cov_nearest_factor_homog(mat, dm)
442        mat1 = rslt.to_matrix()
443
444        # Fit to sparse
445        smat = sparse.csr_matrix(mat)
446        rslt = cov_nearest_factor_homog(smat, dm)
447        mat2 = rslt.to_matrix()
448
449        assert_allclose(mat1, mat2, rtol=0.25, atol=1e-3)
450
451    def test_corr_thresholded(self, reset_randomstate):
452
453        import datetime
454
455        t1 = datetime.datetime.now()
456        X = np.random.normal(size=(2000,10))
457        tcor = corr_thresholded(X, 0.2, max_elt=4e6)
458        t2 = datetime.datetime.now()
459        ss = (t2-t1).seconds
460
461        fcor = np.corrcoef(X)
462        fcor *= (np.abs(fcor) >= 0.2)
463
464        assert_allclose(tcor.todense(), fcor, rtol=0.25, atol=1e-3)
465