1# -*- coding: utf-8 -*- 2""" 3 4Created on Fri Aug 17 13:10:52 2012 5 6Author: Josef Perktold 7License: BSD-3 8""" 9 10import numpy as np 11import scipy.sparse as sparse 12from scipy.sparse.linalg import svds 13from scipy.optimize import fminbound 14import warnings 15 16from statsmodels.tools.tools import Bunch 17from statsmodels.tools.sm_exceptions import ( 18 IterationLimitWarning, iteration_limit_doc) 19 20 21def clip_evals(x, value=0): # threshold=0, value=0): 22 evals, evecs = np.linalg.eigh(x) 23 clipped = np.any(evals < value) 24 x_new = np.dot(evecs * np.maximum(evals, value), evecs.T) 25 return x_new, clipped 26 27 28def corr_nearest(corr, threshold=1e-15, n_fact=100): 29 ''' 30 Find the nearest correlation matrix that is positive semi-definite. 31 32 The function iteratively adjust the correlation matrix by clipping the 33 eigenvalues of a difference matrix. The diagonal elements are set to one. 34 35 Parameters 36 ---------- 37 corr : ndarray, (k, k) 38 initial correlation matrix 39 threshold : float 40 clipping threshold for smallest eigenvalue, see Notes 41 n_fact : int or float 42 factor to determine the maximum number of iterations. The maximum 43 number of iterations is the integer part of the number of columns in 44 the correlation matrix times n_fact. 45 46 Returns 47 ------- 48 corr_new : ndarray, (optional) 49 corrected correlation matrix 50 51 Notes 52 ----- 53 The smallest eigenvalue of the corrected correlation matrix is 54 approximately equal to the ``threshold``. 55 If the threshold=0, then the smallest eigenvalue of the correlation matrix 56 might be negative, but zero within a numerical error, for example in the 57 range of -1e-16. 58 59 Assumes input correlation matrix is symmetric. 60 61 Stops after the first step if correlation matrix is already positive 62 semi-definite or positive definite, so that smallest eigenvalue is above 63 threshold. In this case, the returned array is not the original, but 64 is equal to it within numerical precision. 65 66 See Also 67 -------- 68 corr_clipped 69 cov_nearest 70 71 ''' 72 k_vars = corr.shape[0] 73 if k_vars != corr.shape[1]: 74 raise ValueError("matrix is not square") 75 76 diff = np.zeros(corr.shape) 77 x_new = corr.copy() 78 diag_idx = np.arange(k_vars) 79 80 for ii in range(int(len(corr) * n_fact)): 81 x_adj = x_new - diff 82 x_psd, clipped = clip_evals(x_adj, value=threshold) 83 if not clipped: 84 x_new = x_psd 85 break 86 diff = x_psd - x_adj 87 x_new = x_psd.copy() 88 x_new[diag_idx, diag_idx] = 1 89 else: 90 warnings.warn(iteration_limit_doc, IterationLimitWarning) 91 92 return x_new 93 94 95def corr_clipped(corr, threshold=1e-15): 96 ''' 97 Find a near correlation matrix that is positive semi-definite 98 99 This function clips the eigenvalues, replacing eigenvalues smaller than 100 the threshold by the threshold. The new matrix is normalized, so that the 101 diagonal elements are one. 102 Compared to corr_nearest, the distance between the original correlation 103 matrix and the positive definite correlation matrix is larger, however, 104 it is much faster since it only computes eigenvalues once. 105 106 Parameters 107 ---------- 108 corr : ndarray, (k, k) 109 initial correlation matrix 110 threshold : float 111 clipping threshold for smallest eigenvalue, see Notes 112 113 Returns 114 ------- 115 corr_new : ndarray, (optional) 116 corrected correlation matrix 117 118 119 Notes 120 ----- 121 The smallest eigenvalue of the corrected correlation matrix is 122 approximately equal to the ``threshold``. In examples, the 123 smallest eigenvalue can be by a factor of 10 smaller than the threshold, 124 e.g. threshold 1e-8 can result in smallest eigenvalue in the range 125 between 1e-9 and 1e-8. 126 If the threshold=0, then the smallest eigenvalue of the correlation matrix 127 might be negative, but zero within a numerical error, for example in the 128 range of -1e-16. 129 130 Assumes input correlation matrix is symmetric. The diagonal elements of 131 returned correlation matrix is set to ones. 132 133 If the correlation matrix is already positive semi-definite given the 134 threshold, then the original correlation matrix is returned. 135 136 ``cov_clipped`` is 40 or more times faster than ``cov_nearest`` in simple 137 example, but has a slightly larger approximation error. 138 139 See Also 140 -------- 141 corr_nearest 142 cov_nearest 143 144 ''' 145 x_new, clipped = clip_evals(corr, value=threshold) 146 if not clipped: 147 return corr 148 149 # cov2corr 150 x_std = np.sqrt(np.diag(x_new)) 151 x_new = x_new / x_std / x_std[:, None] 152 return x_new 153 154 155def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100, 156 return_all=False): 157 """ 158 Find the nearest covariance matrix that is positive (semi-) definite 159 160 This leaves the diagonal, i.e. the variance, unchanged 161 162 Parameters 163 ---------- 164 cov : ndarray, (k,k) 165 initial covariance matrix 166 method : str 167 if "clipped", then the faster but less accurate ``corr_clipped`` is 168 used.if "nearest", then ``corr_nearest`` is used 169 threshold : float 170 clipping threshold for smallest eigen value, see Notes 171 n_fact : int or float 172 factor to determine the maximum number of iterations in 173 ``corr_nearest``. See its doc string 174 return_all : bool 175 if False (default), then only the covariance matrix is returned. 176 If True, then correlation matrix and standard deviation are 177 additionally returned. 178 179 Returns 180 ------- 181 cov_ : ndarray 182 corrected covariance matrix 183 corr_ : ndarray, (optional) 184 corrected correlation matrix 185 std_ : ndarray, (optional) 186 standard deviation 187 188 189 Notes 190 ----- 191 This converts the covariance matrix to a correlation matrix. Then, finds 192 the nearest correlation matrix that is positive semidefinite and converts 193 it back to a covariance matrix using the initial standard deviation. 194 195 The smallest eigenvalue of the intermediate correlation matrix is 196 approximately equal to the ``threshold``. 197 If the threshold=0, then the smallest eigenvalue of the correlation matrix 198 might be negative, but zero within a numerical error, for example in the 199 range of -1e-16. 200 201 Assumes input covariance matrix is symmetric. 202 203 See Also 204 -------- 205 corr_nearest 206 corr_clipped 207 """ 208 209 from statsmodels.stats.moment_helpers import cov2corr, corr2cov 210 cov_, std_ = cov2corr(cov, return_std=True) 211 if method == 'clipped': 212 corr_ = corr_clipped(cov_, threshold=threshold) 213 else: # method == 'nearest' 214 corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact) 215 216 cov_ = corr2cov(corr_, std_) 217 218 if return_all: 219 return cov_, corr_, std_ 220 else: 221 return cov_ 222 223 224def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1, 225 sig2=0.9, gam=1e-4, maxiter=100): 226 """ 227 Implements the non-monotone line search of Grippo et al. (1986), 228 as described in Birgin, Martinez and Raydan (2013). 229 230 Parameters 231 ---------- 232 obj : real-valued function 233 The objective function, to be minimized 234 grad : vector-valued function 235 The gradient of the objective function 236 x : array_like 237 The starting point for the line search 238 d : array_like 239 The search direction 240 obj_hist : array_like 241 Objective function history (must contain at least one value) 242 M : positive int 243 Number of previous function points to consider (see references 244 for details). 245 sig1 : real 246 Tuning parameter, see references for details. 247 sig2 : real 248 Tuning parameter, see references for details. 249 gam : real 250 Tuning parameter, see references for details. 251 maxiter : int 252 The maximum number of iterations; returns Nones if convergence 253 does not occur by this point 254 255 Returns 256 ------- 257 alpha : real 258 The step value 259 x : Array_like 260 The function argument at the final step 261 obval : Real 262 The function value at the final step 263 g : Array_like 264 The gradient at the final step 265 266 Notes 267 ----- 268 The basic idea is to take a big step in the direction of the 269 gradient, even if the function value is not decreased (but there 270 is a maximum allowed increase in terms of the recent history of 271 the iterates). 272 273 References 274 ---------- 275 Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line 276 Search Technique for Newton's Method. SIAM Journal on Numerical 277 Analysis, 23, 707-716. 278 279 E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected 280 gradient methods: Review and perspectives. Journal of Statistical 281 Software (preprint). 282 """ 283 284 alpha = 1. 285 last_obval = obj(x) 286 obj_max = max(obj_hist[-M:]) 287 288 for iter in range(maxiter): 289 290 obval = obj(x + alpha*d) 291 g = grad(x) 292 gtd = (g * d).sum() 293 294 if obval <= obj_max + gam*alpha*gtd: 295 return alpha, x + alpha*d, obval, g 296 297 a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd) 298 299 if (sig1 <= a1) and (a1 <= sig2*alpha): 300 alpha = a1 301 else: 302 alpha /= 2. 303 304 last_obval = obval 305 306 return None, None, None, None 307 308 309def _spg_optim(func, grad, start, project, maxiter=1e4, M=10, 310 ctol=1e-3, maxiter_nmls=200, lam_min=1e-30, 311 lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4): 312 """ 313 Implements the spectral projected gradient method for minimizing a 314 differentiable function on a convex domain. 315 316 Parameters 317 ---------- 318 func : real valued function 319 The objective function to be minimized. 320 grad : real array-valued function 321 The gradient of the objective function 322 start : array_like 323 The starting point 324 project : function 325 In-place projection of the argument to the domain 326 of func. 327 ... See notes regarding additional arguments 328 329 Returns 330 ------- 331 rslt : Bunch 332 rslt.params is the final iterate, other fields describe 333 convergence status. 334 335 Notes 336 ----- 337 This can be an effective heuristic algorithm for problems where no 338 guaranteed algorithm for computing a global minimizer is known. 339 340 There are a number of tuning parameters, but these generally 341 should not be changed except for `maxiter` (positive integer) and 342 `ctol` (small positive real). See the Birgin et al reference for 343 more information about the tuning parameters. 344 345 Reference 346 --------- 347 E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected 348 gradient methods: Review and perspectives. Journal of Statistical 349 Software (preprint). Available at: 350 http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf 351 """ 352 353 lam = min(10*lam_min, lam_max) 354 355 params = start.copy() 356 gval = grad(params) 357 358 obj_hist = [func(params), ] 359 360 for itr in range(int(maxiter)): 361 362 # Check convergence 363 df = params - gval 364 project(df) 365 df -= params 366 if np.max(np.abs(df)) < ctol: 367 return Bunch(**{"Converged": True, "params": params, 368 "objective_values": obj_hist, 369 "Message": "Converged successfully"}) 370 371 # The line search direction 372 d = params - lam*gval 373 project(d) 374 d -= params 375 376 # Carry out the nonmonotone line search 377 alpha, params1, fval, gval1 = _nmono_linesearch( 378 func, 379 grad, 380 params, 381 d, 382 obj_hist, 383 M=M, 384 sig1=sig1, 385 sig2=sig2, 386 gam=gam, 387 maxiter=maxiter_nmls) 388 389 if alpha is None: 390 return Bunch(**{"Converged": False, "params": params, 391 "objective_values": obj_hist, 392 "Message": "Failed in nmono_linesearch"}) 393 394 obj_hist.append(fval) 395 s = params1 - params 396 y = gval1 - gval 397 398 sy = (s*y).sum() 399 if sy <= 0: 400 lam = lam_max 401 else: 402 ss = (s*s).sum() 403 lam = max(lam_min, min(ss/sy, lam_max)) 404 405 params = params1 406 gval = gval1 407 408 return Bunch(**{"Converged": False, "params": params, 409 "objective_values": obj_hist, 410 "Message": "spg_optim did not converge"}) 411 412 413def _project_correlation_factors(X): 414 """ 415 Project a matrix into the domain of matrices whose row-wise sums 416 of squares are less than or equal to 1. 417 418 The input matrix is modified in-place. 419 """ 420 nm = np.sqrt((X*X).sum(1)) 421 ii = np.flatnonzero(nm > 1) 422 if len(ii) > 0: 423 X[ii, :] /= nm[ii][:, None] 424 425 426class FactoredPSDMatrix: 427 """ 428 Representation of a positive semidefinite matrix in factored form. 429 430 The representation is constructed based on a vector `diag` and 431 rectangular matrix `root`, such that the PSD matrix represented by 432 the class instance is Diag + root * root', where Diag is the 433 square diagonal matrix with `diag` on its main diagonal. 434 435 Parameters 436 ---------- 437 diag : 1d array_like 438 See above 439 root : 2d array_like 440 See above 441 442 Notes 443 ----- 444 The matrix is represented internally in the form Diag^{1/2}(I + 445 factor * scales * factor')Diag^{1/2}, where `Diag` and `scales` 446 are diagonal matrices, and `factor` is an orthogonal matrix. 447 """ 448 449 def __init__(self, diag, root): 450 self.diag = diag 451 self.root = root 452 root = root / np.sqrt(diag)[:, None] 453 u, s, vt = np.linalg.svd(root, 0) 454 self.factor = u 455 self.scales = s**2 456 457 def to_matrix(self): 458 """ 459 Returns the PSD matrix represented by this instance as a full 460 (square) matrix. 461 """ 462 return np.diag(self.diag) + np.dot(self.root, self.root.T) 463 464 def decorrelate(self, rhs): 465 """ 466 Decorrelate the columns of `rhs`. 467 468 Parameters 469 ---------- 470 rhs : array_like 471 A 2 dimensional array with the same number of rows as the 472 PSD matrix represented by the class instance. 473 474 Returns 475 ------- 476 C^{-1/2} * rhs, where C is the covariance matrix represented 477 by this class instance. 478 479 Notes 480 ----- 481 The returned matrix has the identity matrix as its row-wise 482 population covariance matrix. 483 484 This function exploits the factor structure for efficiency. 485 """ 486 487 # I + factor * qval * factor' is the inverse square root of 488 # the covariance matrix in the homogeneous case where diag = 489 # 1. 490 qval = -1 + 1 / np.sqrt(1 + self.scales) 491 492 # Decorrelate in the general case. 493 rhs = rhs / np.sqrt(self.diag)[:, None] 494 rhs1 = np.dot(self.factor.T, rhs) 495 rhs1 *= qval[:, None] 496 rhs1 = np.dot(self.factor, rhs1) 497 rhs += rhs1 498 499 return rhs 500 501 def solve(self, rhs): 502 """ 503 Solve a linear system of equations with factor-structured 504 coefficients. 505 506 Parameters 507 ---------- 508 rhs : array_like 509 A 2 dimensional array with the same number of rows as the 510 PSD matrix represented by the class instance. 511 512 Returns 513 ------- 514 C^{-1} * rhs, where C is the covariance matrix represented 515 by this class instance. 516 517 Notes 518 ----- 519 This function exploits the factor structure for efficiency. 520 """ 521 522 qval = -self.scales / (1 + self.scales) 523 dr = np.sqrt(self.diag) 524 rhs = rhs / dr[:, None] 525 mat = qval[:, None] * np.dot(self.factor.T, rhs) 526 rhs = rhs + np.dot(self.factor, mat) 527 return rhs / dr[:, None] 528 529 def logdet(self): 530 """ 531 Returns the logarithm of the determinant of a 532 factor-structured matrix. 533 """ 534 535 logdet = np.sum(np.log(self.diag)) 536 logdet += np.sum(np.log(self.scales)) 537 logdet += np.sum(np.log(1 + 1 / self.scales)) 538 539 return logdet 540 541 542def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30, 543 lam_max=1e30, maxiter=1000): 544 """ 545 Find the nearest correlation matrix with factor structure to a 546 given square matrix. 547 548 Parameters 549 ---------- 550 corr : square array 551 The target matrix (to which the nearest correlation matrix is 552 sought). Must be square, but need not be positive 553 semidefinite. 554 rank : int 555 The rank of the factor structure of the solution, i.e., the 556 number of linearly independent columns of X. 557 ctol : positive real 558 Convergence criterion. 559 lam_min : float 560 Tuning parameter for spectral projected gradient optimization 561 (smallest allowed step in the search direction). 562 lam_max : float 563 Tuning parameter for spectral projected gradient optimization 564 (largest allowed step in the search direction). 565 maxiter : int 566 Maximum number of iterations in spectral projected gradient 567 optimization. 568 569 Returns 570 ------- 571 rslt : Bunch 572 rslt.corr is a FactoredPSDMatrix defining the estimated 573 correlation structure. Other fields of `rslt` contain 574 returned values from spg_optim. 575 576 Notes 577 ----- 578 A correlation matrix has factor structure if it can be written in 579 the form I + XX' - diag(XX'), where X is n x k with linearly 580 independent columns, and with each row having sum of squares at 581 most equal to 1. The approximation is made in terms of the 582 Frobenius norm. 583 584 This routine is useful when one has an approximate correlation 585 matrix that is not positive semidefinite, and there is need to 586 estimate the inverse, square root, or inverse square root of the 587 population correlation matrix. The factor structure allows these 588 tasks to be done without constructing any n x n matrices. 589 590 This is a non-convex problem with no known guaranteed globally 591 convergent algorithm for computing the solution. Borsdof, Higham 592 and Raydan (2010) compared several methods for this problem and 593 found the spectral projected gradient (SPG) method (used here) to 594 perform best. 595 596 The input matrix `corr` can be a dense numpy array or any scipy 597 sparse matrix. The latter is useful if the input matrix is 598 obtained by thresholding a very large sample correlation matrix. 599 If `corr` is sparse, the calculations are optimized to save 600 memory, so no working matrix with more than 10^6 elements is 601 constructed. 602 603 References 604 ---------- 605 .. [*] R Borsdof, N Higham, M Raydan (2010). Computing a nearest 606 correlation matrix with factor structure. SIAM J Matrix Anal Appl, 607 31:5, 2603-2622. 608 http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf 609 610 Examples 611 -------- 612 Hard thresholding a correlation matrix may result in a matrix that 613 is not positive semidefinite. We can approximate a hard 614 thresholded correlation matrix with a PSD matrix as follows, where 615 `corr` is the input correlation matrix. 616 617 >>> import numpy as np 618 >>> from statsmodels.stats.correlation_tools import corr_nearest_factor 619 >>> np.random.seed(1234) 620 >>> b = 1.5 - np.random.rand(10, 1) 621 >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) 622 >>> corr = np.corrcoef(x.T) 623 >>> corr = corr * (np.abs(corr) >= 0.3) 624 >>> rslt = corr_nearest_factor(corr, 3) 625 """ 626 627 p, _ = corr.shape 628 629 # Starting values (following the PCA method in BHR). 630 u, s, vt = svds(corr, rank) 631 X = u * np.sqrt(s) 632 nm = np.sqrt((X**2).sum(1)) 633 ii = np.flatnonzero(nm > 1e-5) 634 X[ii, :] /= nm[ii][:, None] 635 636 # Zero the diagonal 637 corr1 = corr.copy() 638 if type(corr1) == np.ndarray: 639 np.fill_diagonal(corr1, 0) 640 elif sparse.issparse(corr1): 641 corr1.setdiag(np.zeros(corr1.shape[0])) 642 corr1.eliminate_zeros() 643 corr1.sort_indices() 644 else: 645 raise ValueError("Matrix type not supported") 646 647 # The gradient, from lemma 4.1 of BHR. 648 def grad(X): 649 gr = np.dot(X, np.dot(X.T, X)) 650 if type(corr1) == np.ndarray: 651 gr -= np.dot(corr1, X) 652 else: 653 gr -= corr1.dot(X) 654 gr -= (X*X).sum(1)[:, None] * X 655 return 4*gr 656 657 # The objective function (sum of squared deviations between fitted 658 # and observed arrays). 659 def func(X): 660 if type(corr1) == np.ndarray: 661 M = np.dot(X, X.T) 662 np.fill_diagonal(M, 0) 663 M -= corr1 664 fval = (M*M).sum() 665 return fval 666 else: 667 fval = 0. 668 # Control the size of intermediates 669 max_ws = 1e6 670 bs = int(max_ws / X.shape[0]) 671 ir = 0 672 while ir < X.shape[0]: 673 ir2 = min(ir+bs, X.shape[0]) 674 u = np.dot(X[ir:ir2, :], X.T) 675 ii = np.arange(u.shape[0]) 676 u[ii, ir+ii] = 0 677 u -= np.asarray(corr1[ir:ir2, :].todense()) 678 fval += (u*u).sum() 679 ir += bs 680 return fval 681 682 rslt = _spg_optim(func, grad, X, _project_correlation_factors, ctol=ctol, 683 lam_min=lam_min, lam_max=lam_max, maxiter=maxiter) 684 root = rslt.params 685 diag = 1 - (root**2).sum(1) 686 soln = FactoredPSDMatrix(diag, root) 687 rslt.corr = soln 688 del rslt.params 689 return rslt 690 691 692def cov_nearest_factor_homog(cov, rank): 693 """ 694 Approximate an arbitrary square matrix with a factor-structured 695 matrix of the form k*I + XX'. 696 697 Parameters 698 ---------- 699 cov : array_like 700 The input array, must be square but need not be positive 701 semidefinite 702 rank : int 703 The rank of the fitted factor structure 704 705 Returns 706 ------- 707 A FactoredPSDMatrix instance containing the fitted matrix 708 709 Notes 710 ----- 711 This routine is useful if one has an estimated covariance matrix 712 that is not SPD, and the ultimate goal is to estimate the inverse, 713 square root, or inverse square root of the true covariance 714 matrix. The factor structure allows these tasks to be performed 715 without constructing any n x n matrices. 716 717 The calculations use the fact that if k is known, then X can be 718 determined from the eigen-decomposition of cov - k*I, which can 719 in turn be easily obtained form the eigen-decomposition of `cov`. 720 Thus the problem can be reduced to a 1-dimensional search for k 721 that does not require repeated eigen-decompositions. 722 723 If the input matrix is sparse, then cov - k*I is also sparse, so 724 the eigen-decomposition can be done efficiently using sparse 725 routines. 726 727 The one-dimensional search for the optimal value of k is not 728 convex, so a local minimum could be obtained. 729 730 Examples 731 -------- 732 Hard thresholding a covariance matrix may result in a matrix that 733 is not positive semidefinite. We can approximate a hard 734 thresholded covariance matrix with a PSD matrix as follows: 735 736 >>> import numpy as np 737 >>> np.random.seed(1234) 738 >>> b = 1.5 - np.random.rand(10, 1) 739 >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) 740 >>> cov = np.cov(x) 741 >>> cov = cov * (np.abs(cov) >= 0.3) 742 >>> rslt = cov_nearest_factor_homog(cov, 3) 743 """ 744 745 m, n = cov.shape 746 747 Q, Lambda, _ = svds(cov, rank) 748 749 if sparse.issparse(cov): 750 QSQ = np.dot(Q.T, cov.dot(Q)) 751 ts = cov.diagonal().sum() 752 tss = cov.dot(cov).diagonal().sum() 753 else: 754 QSQ = np.dot(Q.T, np.dot(cov, Q)) 755 ts = np.trace(cov) 756 tss = np.trace(np.dot(cov, cov)) 757 758 def fun(k): 759 Lambda_t = Lambda - k 760 v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts 761 v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t) 762 return v 763 764 # Get the optimal decomposition 765 k_opt = fminbound(fun, 0, 1e5) 766 Lambda_opt = Lambda - k_opt 767 fac_opt = Q * np.sqrt(Lambda_opt) 768 769 diag = k_opt * np.ones(m, dtype=np.float64) # - (fac_opt**2).sum(1) 770 return FactoredPSDMatrix(diag, fac_opt) 771 772 773def corr_thresholded(data, minabs=None, max_elt=1e7): 774 r""" 775 Construct a sparse matrix containing the thresholded row-wise 776 correlation matrix from a data array. 777 778 Parameters 779 ---------- 780 data : array_like 781 The data from which the row-wise thresholded correlation 782 matrix is to be computed. 783 minabs : non-negative real 784 The threshold value; correlation coefficients smaller in 785 magnitude than minabs are set to zero. If None, defaults 786 to 1 / sqrt(n), see Notes for more information. 787 788 Returns 789 ------- 790 cormat : sparse.coo_matrix 791 The thresholded correlation matrix, in COO format. 792 793 Notes 794 ----- 795 This is an alternative to C = np.corrcoef(data); C \*= (np.abs(C) 796 >= absmin), suitable for very tall data matrices. 797 798 If the data are jointly Gaussian, the marginal sampling 799 distributions of the elements of the sample correlation matrix are 800 approximately Gaussian with standard deviation 1 / sqrt(n). The 801 default value of ``minabs`` is thus equal to 1 standard error, which 802 will set to zero approximately 68% of the estimated correlation 803 coefficients for which the population value is zero. 804 805 No intermediate matrix with more than ``max_elt`` values will be 806 constructed. However memory use could still be high if a large 807 number of correlation values exceed `minabs` in magnitude. 808 809 The thresholded matrix is returned in COO format, which can easily 810 be converted to other sparse formats. 811 812 Examples 813 -------- 814 Here X is a tall data matrix (e.g. with 100,000 rows and 50 815 columns). The row-wise correlation matrix of X is calculated 816 and stored in sparse form, with all entries smaller than 0.3 817 treated as 0. 818 819 >>> import numpy as np 820 >>> np.random.seed(1234) 821 >>> b = 1.5 - np.random.rand(10, 1) 822 >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10) 823 >>> cmat = corr_thresholded(x, 0.3) 824 """ 825 826 nrow, ncol = data.shape 827 828 if minabs is None: 829 minabs = 1. / float(ncol) 830 831 # Row-standardize the data 832 data = data.copy() 833 data -= data.mean(1)[:, None] 834 sd = data.std(1, ddof=1) 835 ii = np.flatnonzero(sd > 1e-5) 836 data[ii, :] /= sd[ii][:, None] 837 ii = np.flatnonzero(sd <= 1e-5) 838 data[ii, :] = 0 839 840 # Number of rows to process in one pass 841 bs = int(np.floor(max_elt / nrow)) 842 843 ipos_all, jpos_all, cor_values = [], [], [] 844 845 ir = 0 846 while ir < nrow: 847 ir2 = min(data.shape[0], ir + bs) 848 cm = np.dot(data[ir:ir2, :], data.T) / (ncol - 1) 849 cma = np.abs(cm) 850 ipos, jpos = np.nonzero(cma >= minabs) 851 ipos_all.append(ipos + ir) 852 jpos_all.append(jpos) 853 cor_values.append(cm[ipos, jpos]) 854 ir += bs 855 856 ipos = np.concatenate(ipos_all) 857 jpos = np.concatenate(jpos_all) 858 cor_values = np.concatenate(cor_values) 859 860 cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow)) 861 862 return cmat 863 864 865class MultivariateKernel(object): 866 """ 867 Base class for multivariate kernels. 868 869 An instance of MultivariateKernel implements a `call` method having 870 signature `call(x, loc)`, returning the kernel weights comparing `x` 871 (a 1d ndarray) to each row of `loc` (a 2d ndarray). 872 """ 873 874 def call(self, x, loc): 875 raise NotImplementedError 876 877 def set_bandwidth(self, bw): 878 """ 879 Set the bandwidth to the given vector. 880 881 Parameters 882 ---------- 883 bw : array_like 884 A vector of non-negative bandwidth values. 885 """ 886 887 self.bw = bw 888 self._setup() 889 890 def _setup(self): 891 892 # Precompute the squared bandwidth values. 893 self.bwk = np.prod(self.bw) 894 self.bw2 = self.bw * self.bw 895 896 def set_default_bw(self, loc, bwm=None): 897 """ 898 Set default bandwiths based on domain values. 899 900 Parameters 901 ---------- 902 loc : array_like 903 Values from the domain to which the kernel will 904 be applied. 905 bwm : scalar, optional 906 A non-negative scalar that is used to multiply 907 the default bandwidth. 908 """ 909 910 sd = loc.std(0) 911 q25, q75 = np.percentile(loc, [25, 75], axis=0) 912 iqr = (q75 - q25) / 1.349 913 bw = np.where(iqr < sd, iqr, sd) 914 bw *= 0.9 / loc.shape[0] ** 0.2 915 916 if bwm is not None: 917 bw *= bwm 918 919 # The final bandwidths 920 self.bw = np.asarray(bw, dtype=np.float64) 921 922 self._setup() 923 924 925class GaussianMultivariateKernel(MultivariateKernel): 926 """ 927 The Gaussian (squared exponential) multivariate kernel. 928 """ 929 930 def call(self, x, loc): 931 return np.exp(-(x - loc)**2 / (2 * self.bw2)).sum(1) / self.bwk 932 933 934def kernel_covariance(exog, loc, groups, kernel=None, bw=None): 935 """ 936 Use kernel averaging to estimate a multivariate covariance function. 937 938 The goal is to estimate a covariance function C(x, y) = 939 cov(Z(x), Z(y)) where x, y are vectors in R^p (e.g. representing 940 locations in time or space), and Z(.) represents a multivariate 941 process on R^p. 942 943 The data used for estimation can be observed at arbitrary values of the 944 position vector, and there can be multiple independent observations 945 from the process. 946 947 Parameters 948 ---------- 949 exog : array_like 950 The rows of exog are realizations of the process obtained at 951 specified points. 952 loc : array_like 953 The rows of loc are the locations (e.g. in space or time) at 954 which the rows of exog are observed. 955 groups : array_like 956 The values of groups are labels for distinct independent copies 957 of the process. 958 kernel : MultivariateKernel instance, optional 959 An instance of MultivariateKernel, defaults to 960 GaussianMultivariateKernel. 961 bw : array_like or scalar 962 A bandwidth vector, or bandwidth multiplier. If a 1d array, it 963 contains kernel bandwidths for each component of the process, and 964 must have length equal to the number of columns of exog. If a scalar, 965 bw is a bandwidth multiplier used to adjust the default bandwidth; if 966 None, a default bandwidth is used. 967 968 Returns 969 ------- 970 A real-valued function C(x, y) that returns an estimate of the covariance 971 between values of the process located at x and y. 972 973 References 974 ---------- 975 .. [1] Genton M, W Kleiber (2015). Cross covariance functions for 976 multivariate geostatics. Statistical Science 30(2). 977 https://arxiv.org/pdf/1507.08017.pdf 978 """ 979 980 exog = np.asarray(exog) 981 loc = np.asarray(loc) 982 groups = np.asarray(groups) 983 984 if loc.ndim == 1: 985 loc = loc[:, None] 986 987 v = [exog.shape[0], loc.shape[0], len(groups)] 988 if min(v) != max(v): 989 msg = "exog, loc, and groups must have the same number of rows" 990 raise ValueError(msg) 991 992 # Map from group labels to the row indices in each group. 993 ix = {} 994 for i, g in enumerate(groups): 995 if g not in ix: 996 ix[g] = [] 997 ix[g].append(i) 998 for g in ix.keys(): 999 ix[g] = np.sort(ix[g]) 1000 1001 if kernel is None: 1002 kernel = GaussianMultivariateKernel() 1003 1004 if bw is None: 1005 kernel.set_default_bw(loc) 1006 elif np.isscalar(bw): 1007 kernel.set_default_bw(loc, bwm=bw) 1008 else: 1009 kernel.set_bandwidth(bw) 1010 1011 def cov(x, y): 1012 1013 kx = kernel.call(x, loc) 1014 ky = kernel.call(y, loc) 1015 1016 cm, cw = 0., 0. 1017 1018 for g, ii in ix.items(): 1019 1020 m = len(ii) 1021 j1, j2 = np.indices((m, m)) 1022 j1 = ii[j1.flat] 1023 j2 = ii[j2.flat] 1024 w = kx[j1] * ky[j2] 1025 1026 # TODO: some other form of broadcasting may be faster than 1027 # einsum here 1028 cm += np.einsum("ij,ik,i->jk", exog[j1, :], exog[j2, :], w) 1029 cw += w.sum() 1030 1031 if cw < 1e-10: 1032 msg = ("Effective sample size is 0. The bandwidth may be too " + 1033 "small, or you are outside the range of your data.") 1034 warnings.warn(msg) 1035 return np.nan * np.ones_like(cm) 1036 1037 return cm / cw 1038 1039 return cov 1040