1# -*- coding: utf-8 -*- 2"""Tests for finding a positive semi-definite correlation or covariance matrix 3 4Created on Mon May 27 12:07:02 2013 5 6Author: Josef Perktold 7""" 8import warnings 9 10import numpy as np 11from numpy.testing import assert_almost_equal, assert_allclose 12import scipy.sparse as sparse 13import pytest 14 15from statsmodels.stats.correlation_tools import ( 16 corr_nearest, corr_clipped, cov_nearest, 17 _project_correlation_factors, corr_nearest_factor, _spg_optim, 18 corr_thresholded, cov_nearest_factor_homog, FactoredPSDMatrix) 19from statsmodels.tools.testing import Holder 20 21 22def norm_f(x, y): 23 '''Frobenious norm (squared sum) of difference between two arrays 24 ''' 25 d = ((x - y)**2).sum() 26 return np.sqrt(d) 27 28 29# R library Matrix results 30cov1_r = Holder() 31#> nc <- nearPD(pr, conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=TRUE) 32#> cat_items(nc, prefix="cov1_r.") 33cov1_r.mat = '''<S4 object of class structure("dpoMatrix", package = "Matrix")>''' 34cov1_r.eigenvalues = np.array([ 35 4.197315628646795, 0.7540460243978023, 0.5077608149667492, 36 0.3801267599652769, 0.1607508970775889, 4.197315628646795e-08 37 ]) 38cov1_r.corr = '''TRUE''' 39cov1_r.normF = 0.0743805226512533 40cov1_r.iterations = 11 41cov1_r.rel_tol = 8.288594638441735e-08 42cov1_r.converged = '''TRUE''' 43#> mkarray2(as.matrix(nc$mat), name="cov1_r.mat") 44cov1_r.mat = np.array([ 45 1, 0.487968018215892, 0.642651880010906, 0.4906386709070835, 46 0.6440990530811909, 0.8087111845493985, 0.487968018215892, 1, 47 0.5141147294352735, 0.2506688108312097, 0.672351311297074, 48 0.725832055882795, 0.642651880010906, 0.5141147294352735, 1, 49 0.596827778712154, 0.5821917790519067, 0.7449631633814129, 50 0.4906386709070835, 0.2506688108312097, 0.596827778712154, 1, 51 0.729882058012399, 0.772150225146826, 0.6440990530811909, 52 0.672351311297074, 0.5821917790519067, 0.729882058012399, 1, 53 0.813191720191944, 0.8087111845493985, 0.725832055882795, 54 0.7449631633814129, 0.772150225146826, 0.813191720191944, 1 55 ]).reshape(6,6, order='F') 56 57 58cov_r = Holder() 59#nc <- nearPD(pr+0.01*diag(6), conv.tol = 1e-7, keepDiag = TRUE, doDykstra =FALSE, corr=FALSE) 60#> cat_items(nc, prefix="cov_r.") 61#cov_r.mat = '''<S4 object of class structure("dpoMatrix", package = "Matrix")>''' 62cov_r.eigenvalues = np.array([ 63 4.209897516692652, 0.7668341923072066, 0.518956980021938, 64 0.390838551407132, 0.1734728460460068, 4.209897516692652e-08 65 ]) 66cov_r.corr = '''FALSE''' 67cov_r.normF = 0.0623948693159157 68cov_r.iterations = 11 69cov_r.rel_tol = 5.83987595937896e-08 70cov_r.converged = '''TRUE''' 71 72#> mkarray2(as.matrix(nc$mat), name="cov_r.mat") 73cov_r.mat = np.array([ 74 1.01, 0.486207476951913, 0.6428524769306785, 0.4886092840296514, 75 0.645175579158233, 0.811533860074678, 0.486207476951913, 1.01, 76 0.514394615153752, 0.2478398278204047, 0.673852495852274, 77 0.7297661648968664, 0.6428524769306785, 0.514394615153752, 1.01, 78 0.5971503271420517, 0.582018469844712, 0.7445177382760834, 79 0.4886092840296514, 0.2478398278204047, 0.5971503271420517, 1.01, 80 0.73161232298669, 0.7766852947049376, 0.645175579158233, 81 0.673852495852274, 0.582018469844712, 0.73161232298669, 1.01, 82 0.8107916469252828, 0.811533860074678, 0.7297661648968664, 83 0.7445177382760834, 0.7766852947049376, 0.8107916469252828, 1.01 84 ]).reshape(6,6, order='F') 85 86 87def test_corr_psd(): 88 # test positive definite matrix is unchanged 89 x = np.array([[1, -0.2, -0.9], [-0.2, 1, -0.2], [-0.9, -0.2, 1]]) 90 91 y = corr_nearest(x, n_fact=100) 92 #print np.max(np.abs(x - y)) 93 assert_almost_equal(x, y, decimal=14) 94 95 y = corr_clipped(x) 96 assert_almost_equal(x, y, decimal=14) 97 98 y = cov_nearest(x, n_fact=100) 99 assert_almost_equal(x, y, decimal=14) 100 101 x2 = x + 0.001 * np.eye(3) 102 y = cov_nearest(x2, n_fact=100) 103 assert_almost_equal(x2, y, decimal=14) 104 105 106class CheckCorrPSDMixin(object): 107 108 def test_nearest(self): 109 x = self.x 110 res_r = self.res 111 y = corr_nearest(x, threshold=1e-7, n_fact=100) 112 #print np.max(np.abs(x - y)) 113 assert_almost_equal(y, res_r.mat, decimal=3) 114 d = norm_f(x, y) 115 assert_allclose(d, res_r.normF, rtol=0.0015) 116 evals = np.linalg.eigvalsh(y) 117 #print 'evals', evals / res_r.eigenvalues[::-1] - 1 118 assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.003, atol=1e-7) 119 #print evals[0] / 1e-7 - 1 120 assert_allclose(evals[0], 1e-7, rtol=1e-6) 121 122 def test_clipped(self): 123 x = self.x 124 res_r = self.res 125 y = corr_clipped(x, threshold=1e-7) 126 #print np.max(np.abs(x - y)), np.max(np.abs((x - y) / y)) 127 assert_almost_equal(y, res_r.mat, decimal=1) 128 d = norm_f(x, y) 129 assert_allclose(d, res_r.normF, rtol=0.15) 130 131 evals = np.linalg.eigvalsh(y) 132 assert_allclose(evals, res_r.eigenvalues[::-1], rtol=0.1, atol=1e-7) 133 assert_allclose(evals[0], 1e-7, rtol=0.02) 134 135 def test_cov_nearest(self): 136 x = self.x 137 res_r = self.res 138 with warnings.catch_warnings(): 139 warnings.simplefilter("ignore") 140 y = cov_nearest(x, method='nearest', threshold=1e-7) 141 #print np.max(np.abs(x - y)) 142 assert_almost_equal(y, res_r.mat, decimal=2) 143 d = norm_f(x, y) 144 assert_allclose(d, res_r.normF, rtol=0.0015) 145 146 147class TestCovPSD(object): 148 149 @classmethod 150 def setup_class(cls): 151 x = np.array([ 1, 0.477, 0.644, 0.478, 0.651, 0.826, 152 0.477, 1, 0.516, 0.233, 0.682, 0.75, 153 0.644, 0.516, 1, 0.599, 0.581, 0.742, 154 0.478, 0.233, 0.599, 1, 0.741, 0.8, 155 0.651, 0.682, 0.581, 0.741, 1, 0.798, 156 0.826, 0.75, 0.742, 0.8, 0.798, 1]).reshape(6,6) 157 cls.x = x + 0.01 * np.eye(6) 158 cls.res = cov_r 159 160 def test_cov_nearest(self): 161 x = self.x 162 res_r = self.res 163 y = cov_nearest(x, method='nearest') 164 #print np.max(np.abs(x - y)) 165 assert_almost_equal(y, res_r.mat, decimal=3) 166 d = norm_f(x, y) 167 assert_allclose(d, res_r.normF, rtol=0.001) 168 169 y = cov_nearest(x, method='clipped') 170 #print np.max(np.abs(x - y)) 171 assert_almost_equal(y, res_r.mat, decimal=2) 172 d = norm_f(x, y) 173 assert_allclose(d, res_r.normF, rtol=0.15) 174 175 176class TestCorrPSD1(CheckCorrPSDMixin): 177 178 @classmethod 179 def setup_class(cls): 180 x = np.array([ 1, 0.477, 0.644, 0.478, 0.651, 0.826, 181 0.477, 1, 0.516, 0.233, 0.682, 0.75, 182 0.644, 0.516, 1, 0.599, 0.581, 0.742, 183 0.478, 0.233, 0.599, 1, 0.741, 0.8, 184 0.651, 0.682, 0.581, 0.741, 1, 0.798, 185 0.826, 0.75, 0.742, 0.8, 0.798, 1]).reshape(6,6) 186 cls.x = x 187 cls.res = cov1_r 188 189 190@pytest.mark.parametrize('threshold', [0, 1e-15, 1e-10, 1e-6]) 191def test_corrpsd_threshold(threshold): 192 x = np.array([[1, -0.9, -0.9], [-0.9, 1, -0.9], [-0.9, -0.9, 1]]) 193 194 y = corr_nearest(x, n_fact=100, threshold=threshold) 195 evals = np.linalg.eigvalsh(y) 196 assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15) 197 198 y = corr_clipped(x, threshold=threshold) 199 evals = np.linalg.eigvalsh(y) 200 assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15) 201 202 y = cov_nearest(x, method='nearest', n_fact=100, threshold=threshold) 203 evals = np.linalg.eigvalsh(y) 204 assert_allclose(evals[0], threshold, rtol=1e-6, atol=1e-15) 205 206 y = cov_nearest(x, n_fact=100, threshold=threshold) 207 evals = np.linalg.eigvalsh(y) 208 assert_allclose(evals[0], threshold, rtol=0.25, atol=1e-15) 209 210 211class Test_Factor(object): 212 213 def test_corr_nearest_factor_arrpack(self): 214 215 # regression results for svds call 216 u2 = np.array([[ 217 6.39407581e-19, 9.15225947e-03, 1.82631698e-02, 218 2.72917181e-02, 3.61975557e-02, 4.49413101e-02, 219 5.34848732e-02, 6.17916613e-02, 6.98268388e-02, 220 7.75575058e-02, 8.49528448e-02, 9.19842264e-02, 221 9.86252769e-02, 1.04851906e-01, 1.10642305e-01, 222 1.15976906e-01, 1.20838331e-01, 1.25211306e-01, 223 1.29082570e-01, 1.32440778e-01, 1.35276397e-01, 224 1.37581605e-01, 1.39350201e-01, 1.40577526e-01, 225 1.41260396e-01, 1.41397057e-01, 1.40987160e-01, 226 1.40031756e-01, 1.38533306e-01, 1.36495727e-01, 227 1.33924439e-01, 1.30826443e-01, 1.27210404e-01, 228 1.23086750e-01, 1.18467769e-01, 1.13367717e-01, 229 1.07802909e-01, 1.01791811e-01, 9.53551023e-02, 230 8.85157320e-02, 8.12989329e-02, 7.37322125e-02, 231 6.58453049e-02, 5.76700847e-02, 4.92404406e-02, 232 4.05921079e-02, 3.17624629e-02, 2.27902803e-02, 233 1.37154584e-02, 4.57871801e-03, -4.57871801e-03, 234 -1.37154584e-02, -2.27902803e-02, -3.17624629e-02, 235 -4.05921079e-02, -4.92404406e-02, -5.76700847e-02, 236 -6.58453049e-02, -7.37322125e-02, -8.12989329e-02, 237 -8.85157320e-02, -9.53551023e-02, -1.01791811e-01, 238 -1.07802909e-01, -1.13367717e-01, -1.18467769e-01, 239 -1.23086750e-01, -1.27210404e-01, -1.30826443e-01, 240 -1.33924439e-01, -1.36495727e-01, -1.38533306e-01, 241 -1.40031756e-01, -1.40987160e-01, -1.41397057e-01, 242 -1.41260396e-01, -1.40577526e-01, -1.39350201e-01, 243 -1.37581605e-01, -1.35276397e-01, -1.32440778e-01, 244 -1.29082570e-01, -1.25211306e-01, -1.20838331e-01, 245 -1.15976906e-01, -1.10642305e-01, -1.04851906e-01, 246 -9.86252769e-02, -9.19842264e-02, -8.49528448e-02, 247 -7.75575058e-02, -6.98268388e-02, -6.17916613e-02, 248 -5.34848732e-02, -4.49413101e-02, -3.61975557e-02, 249 -2.72917181e-02, -1.82631698e-02, -9.15225947e-03, 250 -3.51829569e-17]]).T 251 s2 = np.array([ 24.88812183]) 252 253 d = 100 254 dm = 1 255 256 # Construct a test matrix with exact factor structure 257 X = np.zeros((d,dm), dtype=np.float64) 258 x = np.linspace(0, 2*np.pi, d) 259 for j in range(dm): 260 X[:,j] = np.sin(x*(j+1)) 261 _project_correlation_factors(X) 262 X *= 0.7 263 mat = np.dot(X, X.T) 264 np.fill_diagonal(mat, 1.) 265 266 from scipy.sparse.linalg import svds 267 u, s, vt = svds(mat, dm) 268 269 #difference in sign 270 dsign = np.sign(u[1]) * np.sign(u2[1]) 271 272 assert_allclose(u, dsign * u2, rtol=1e-6, atol=1e-14) 273 assert_allclose(s, s2, rtol=1e-6) 274 275 @pytest.mark.parametrize('dm', [1, 2]) 276 def test_corr_nearest_factor(self, dm): 277 278 objvals = [np.array([6241.8, 6241.8, 579.4, 264.6, 264.3]), 279 np.array([2104.9, 2104.9, 710.5, 266.3, 286.1])] 280 281 d = 100 282 283 # Construct a test matrix with exact factor structure 284 X = np.zeros((d, dm), dtype=np.float64) 285 x = np.linspace(0, 2 * np.pi, d) 286 np.random.seed(10) 287 for j in range(dm): 288 X[:, j] = np.sin(x * (j + 1)) + 1e-10 * np.random.randn(d) 289 290 _project_correlation_factors(X) 291 assert np.isfinite(X).all() 292 X *= 0.7 293 mat = np.dot(X, X.T) 294 np.fill_diagonal(mat, 1.) 295 296 # Try to recover the structure 297 rslt = corr_nearest_factor(mat, dm, maxiter=10000) 298 err_msg = 'rank=%d, niter=%d' % (dm, len(rslt.objective_values)) 299 assert_allclose(rslt.objective_values[:5], objvals[dm - 1], 300 rtol=0.5, err_msg=err_msg) 301 assert rslt.Converged 302 303 mat1 = rslt.corr.to_matrix() 304 assert_allclose(mat, mat1, rtol=0.25, atol=1e-3, err_msg=err_msg) 305 306 @pytest.mark.slow 307 @pytest.mark.parametrize('dm', [1, 2]) 308 def test_corr_nearest_factor_sparse(self, dm): 309 # Test that result is the same if the input is dense or sparse 310 d = 200 311 312 # Generate a test matrix of factors 313 X = np.zeros((d, dm), dtype=np.float64) 314 x = np.linspace(0, 2 * np.pi, d) 315 rs = np.random.RandomState(10) 316 for j in range(dm): 317 X[:, j] = np.sin(x * (j + 1)) + rs.randn(d) 318 319 # Get the correlation matrix 320 _project_correlation_factors(X) 321 X *= 0.7 322 mat = np.dot(X, X.T) 323 np.fill_diagonal(mat, 1) 324 325 # Threshold it 326 mat.flat[np.abs(mat.flat) < 0.35] = 0.0 327 smat = sparse.csr_matrix(mat) 328 329 dense_rslt = corr_nearest_factor(mat, dm, maxiter=10000) 330 sparse_rslt = corr_nearest_factor(smat, dm, maxiter=10000) 331 332 mat_dense = dense_rslt.corr.to_matrix() 333 mat_sparse = sparse_rslt.corr.to_matrix() 334 335 assert dense_rslt.Converged is sparse_rslt.Converged 336 assert dense_rslt.Converged is True 337 338 assert_allclose(mat_dense, mat_sparse, rtol=.25, atol=1e-3) 339 340 # Test on a quadratic function. 341 def test_spg_optim(self, reset_randomstate): 342 343 dm = 100 344 345 ind = np.arange(dm) 346 indmat = np.abs(ind[:,None] - ind[None,:]) 347 M = 0.8**indmat 348 349 def obj(x): 350 return np.dot(x, np.dot(M, x)) 351 352 def grad(x): 353 return 2*np.dot(M, x) 354 355 def project(x): 356 return x 357 358 x = np.random.normal(size=dm) 359 rslt = _spg_optim(obj, grad, x, project) 360 xnew = rslt.params 361 assert rslt.Converged is True 362 assert_almost_equal(obj(xnew), 0, decimal=3) 363 364 def test_decorrelate(self, reset_randomstate): 365 366 d = 30 367 dg = np.linspace(1, 2, d) 368 root = np.random.normal(size=(d, 4)) 369 fac = FactoredPSDMatrix(dg, root) 370 mat = fac.to_matrix() 371 rmat = np.linalg.cholesky(mat) 372 dcr = fac.decorrelate(rmat) 373 idm = np.dot(dcr, dcr.T) 374 assert_almost_equal(idm, np.eye(d)) 375 376 rhs = np.random.normal(size=(d, 5)) 377 mat2 = np.dot(rhs.T, np.linalg.solve(mat, rhs)) 378 mat3 = fac.decorrelate(rhs) 379 mat3 = np.dot(mat3.T, mat3) 380 assert_almost_equal(mat2, mat3) 381 382 def test_logdet(self, reset_randomstate): 383 384 d = 30 385 dg = np.linspace(1, 2, d) 386 root = np.random.normal(size=(d, 4)) 387 fac = FactoredPSDMatrix(dg, root) 388 mat = fac.to_matrix() 389 390 _, ld = np.linalg.slogdet(mat) 391 ld2 = fac.logdet() 392 393 assert_almost_equal(ld, ld2) 394 395 def test_solve(self, reset_randomstate): 396 397 d = 30 398 dg = np.linspace(1, 2, d) 399 root = np.random.normal(size=(d, 2)) 400 fac = FactoredPSDMatrix(dg, root) 401 rhs = np.random.normal(size=(d, 5)) 402 sr1 = fac.solve(rhs) 403 mat = fac.to_matrix() 404 sr2 = np.linalg.solve(mat, rhs) 405 assert_almost_equal(sr1, sr2) 406 407 @pytest.mark.parametrize('dm', [1, 2]) 408 def test_cov_nearest_factor_homog(self, dm): 409 410 d = 100 411 412 # Construct a test matrix with exact factor structure 413 X = np.zeros((d, dm), dtype=np.float64) 414 x = np.linspace(0, 2*np.pi, d) 415 for j in range(dm): 416 X[:, j] = np.sin(x*(j+1)) 417 mat = np.dot(X, X.T) 418 np.fill_diagonal(mat, np.diag(mat) + 3.1) 419 420 # Try to recover the structure 421 rslt = cov_nearest_factor_homog(mat, dm) 422 mat1 = rslt.to_matrix() 423 424 assert_allclose(mat, mat1, rtol=0.25, atol=1e-3) 425 426 @pytest.mark.parametrize('dm', [1, 2]) 427 def test_cov_nearest_factor_homog_sparse(self, dm): 428 # Check that dense and sparse inputs give the same result 429 430 d = 100 431 432 # Construct a test matrix with exact factor structure 433 X = np.zeros((d, dm), dtype=np.float64) 434 x = np.linspace(0, 2*np.pi, d) 435 for j in range(dm): 436 X[:, j] = np.sin(x*(j+1)) 437 mat = np.dot(X, X.T) 438 np.fill_diagonal(mat, np.diag(mat) + 3.1) 439 440 # Fit to dense 441 rslt = cov_nearest_factor_homog(mat, dm) 442 mat1 = rslt.to_matrix() 443 444 # Fit to sparse 445 smat = sparse.csr_matrix(mat) 446 rslt = cov_nearest_factor_homog(smat, dm) 447 mat2 = rslt.to_matrix() 448 449 assert_allclose(mat1, mat2, rtol=0.25, atol=1e-3) 450 451 def test_corr_thresholded(self, reset_randomstate): 452 453 import datetime 454 455 t1 = datetime.datetime.now() 456 X = np.random.normal(size=(2000,10)) 457 tcor = corr_thresholded(X, 0.2, max_elt=4e6) 458 t2 = datetime.datetime.now() 459 ss = (t2-t1).seconds 460 461 fcor = np.corrcoef(X) 462 fcor *= (np.abs(fcor) >= 0.2) 463 464 assert_allclose(tcor.todense(), fcor, rtol=0.25, atol=1e-3) 465