1# -*- coding: utf-8 -*-
2"""
3
4Created on Fri Aug 17 13:10:52 2012
5
6Author: Josef Perktold
7License: BSD-3
8"""
9
10import numpy as np
11import scipy.sparse as sparse
12from scipy.sparse.linalg import svds
13from scipy.optimize import fminbound
14import warnings
15
16from statsmodels.tools.tools import Bunch
17from statsmodels.tools.sm_exceptions import (
18    IterationLimitWarning, iteration_limit_doc)
19
20
21def clip_evals(x, value=0):  # threshold=0, value=0):
22    evals, evecs = np.linalg.eigh(x)
23    clipped = np.any(evals < value)
24    x_new = np.dot(evecs * np.maximum(evals, value), evecs.T)
25    return x_new, clipped
26
27
28def corr_nearest(corr, threshold=1e-15, n_fact=100):
29    '''
30    Find the nearest correlation matrix that is positive semi-definite.
31
32    The function iteratively adjust the correlation matrix by clipping the
33    eigenvalues of a difference matrix. The diagonal elements are set to one.
34
35    Parameters
36    ----------
37    corr : ndarray, (k, k)
38        initial correlation matrix
39    threshold : float
40        clipping threshold for smallest eigenvalue, see Notes
41    n_fact : int or float
42        factor to determine the maximum number of iterations. The maximum
43        number of iterations is the integer part of the number of columns in
44        the correlation matrix times n_fact.
45
46    Returns
47    -------
48    corr_new : ndarray, (optional)
49        corrected correlation matrix
50
51    Notes
52    -----
53    The smallest eigenvalue of the corrected correlation matrix is
54    approximately equal to the ``threshold``.
55    If the threshold=0, then the smallest eigenvalue of the correlation matrix
56    might be negative, but zero within a numerical error, for example in the
57    range of -1e-16.
58
59    Assumes input correlation matrix is symmetric.
60
61    Stops after the first step if correlation matrix is already positive
62    semi-definite or positive definite, so that smallest eigenvalue is above
63    threshold. In this case, the returned array is not the original, but
64    is equal to it within numerical precision.
65
66    See Also
67    --------
68    corr_clipped
69    cov_nearest
70
71    '''
72    k_vars = corr.shape[0]
73    if k_vars != corr.shape[1]:
74        raise ValueError("matrix is not square")
75
76    diff = np.zeros(corr.shape)
77    x_new = corr.copy()
78    diag_idx = np.arange(k_vars)
79
80    for ii in range(int(len(corr) * n_fact)):
81        x_adj = x_new - diff
82        x_psd, clipped = clip_evals(x_adj, value=threshold)
83        if not clipped:
84            x_new = x_psd
85            break
86        diff = x_psd - x_adj
87        x_new = x_psd.copy()
88        x_new[diag_idx, diag_idx] = 1
89    else:
90        warnings.warn(iteration_limit_doc, IterationLimitWarning)
91
92    return x_new
93
94
95def corr_clipped(corr, threshold=1e-15):
96    '''
97    Find a near correlation matrix that is positive semi-definite
98
99    This function clips the eigenvalues, replacing eigenvalues smaller than
100    the threshold by the threshold. The new matrix is normalized, so that the
101    diagonal elements are one.
102    Compared to corr_nearest, the distance between the original correlation
103    matrix and the positive definite correlation matrix is larger, however,
104    it is much faster since it only computes eigenvalues once.
105
106    Parameters
107    ----------
108    corr : ndarray, (k, k)
109        initial correlation matrix
110    threshold : float
111        clipping threshold for smallest eigenvalue, see Notes
112
113    Returns
114    -------
115    corr_new : ndarray, (optional)
116        corrected correlation matrix
117
118
119    Notes
120    -----
121    The smallest eigenvalue of the corrected correlation matrix is
122    approximately equal to the ``threshold``. In examples, the
123    smallest eigenvalue can be by a factor of 10 smaller than the threshold,
124    e.g. threshold 1e-8 can result in smallest eigenvalue in the range
125    between 1e-9 and 1e-8.
126    If the threshold=0, then the smallest eigenvalue of the correlation matrix
127    might be negative, but zero within a numerical error, for example in the
128    range of -1e-16.
129
130    Assumes input correlation matrix is symmetric. The diagonal elements of
131    returned correlation matrix is set to ones.
132
133    If the correlation matrix is already positive semi-definite given the
134    threshold, then the original correlation matrix is returned.
135
136    ``cov_clipped`` is 40 or more times faster than ``cov_nearest`` in simple
137    example, but has a slightly larger approximation error.
138
139    See Also
140    --------
141    corr_nearest
142    cov_nearest
143
144    '''
145    x_new, clipped = clip_evals(corr, value=threshold)
146    if not clipped:
147        return corr
148
149    # cov2corr
150    x_std = np.sqrt(np.diag(x_new))
151    x_new = x_new / x_std / x_std[:, None]
152    return x_new
153
154
155def cov_nearest(cov, method='clipped', threshold=1e-15, n_fact=100,
156                return_all=False):
157    """
158    Find the nearest covariance matrix that is positive (semi-) definite
159
160    This leaves the diagonal, i.e. the variance, unchanged
161
162    Parameters
163    ----------
164    cov : ndarray, (k,k)
165        initial covariance matrix
166    method : str
167        if "clipped", then the faster but less accurate ``corr_clipped`` is
168        used.if "nearest", then ``corr_nearest`` is used
169    threshold : float
170        clipping threshold for smallest eigen value, see Notes
171    n_fact : int or float
172        factor to determine the maximum number of iterations in
173        ``corr_nearest``. See its doc string
174    return_all : bool
175        if False (default), then only the covariance matrix is returned.
176        If True, then correlation matrix and standard deviation are
177        additionally returned.
178
179    Returns
180    -------
181    cov_ : ndarray
182        corrected covariance matrix
183    corr_ : ndarray, (optional)
184        corrected correlation matrix
185    std_ : ndarray, (optional)
186        standard deviation
187
188
189    Notes
190    -----
191    This converts the covariance matrix to a correlation matrix. Then, finds
192    the nearest correlation matrix that is positive semidefinite and converts
193    it back to a covariance matrix using the initial standard deviation.
194
195    The smallest eigenvalue of the intermediate correlation matrix is
196    approximately equal to the ``threshold``.
197    If the threshold=0, then the smallest eigenvalue of the correlation matrix
198    might be negative, but zero within a numerical error, for example in the
199    range of -1e-16.
200
201    Assumes input covariance matrix is symmetric.
202
203    See Also
204    --------
205    corr_nearest
206    corr_clipped
207    """
208
209    from statsmodels.stats.moment_helpers import cov2corr, corr2cov
210    cov_, std_ = cov2corr(cov, return_std=True)
211    if method == 'clipped':
212        corr_ = corr_clipped(cov_, threshold=threshold)
213    else:  # method == 'nearest'
214        corr_ = corr_nearest(cov_, threshold=threshold, n_fact=n_fact)
215
216    cov_ = corr2cov(corr_, std_)
217
218    if return_all:
219        return cov_, corr_, std_
220    else:
221        return cov_
222
223
224def _nmono_linesearch(obj, grad, x, d, obj_hist, M=10, sig1=0.1,
225                      sig2=0.9, gam=1e-4, maxiter=100):
226    """
227    Implements the non-monotone line search of Grippo et al. (1986),
228    as described in Birgin, Martinez and Raydan (2013).
229
230    Parameters
231    ----------
232    obj : real-valued function
233        The objective function, to be minimized
234    grad : vector-valued function
235        The gradient of the objective function
236    x : array_like
237        The starting point for the line search
238    d : array_like
239        The search direction
240    obj_hist : array_like
241        Objective function history (must contain at least one value)
242    M : positive int
243        Number of previous function points to consider (see references
244        for details).
245    sig1 : real
246        Tuning parameter, see references for details.
247    sig2 : real
248        Tuning parameter, see references for details.
249    gam : real
250        Tuning parameter, see references for details.
251    maxiter : int
252        The maximum number of iterations; returns Nones if convergence
253        does not occur by this point
254
255    Returns
256    -------
257    alpha : real
258        The step value
259    x : Array_like
260        The function argument at the final step
261    obval : Real
262        The function value at the final step
263    g : Array_like
264        The gradient at the final step
265
266    Notes
267    -----
268    The basic idea is to take a big step in the direction of the
269    gradient, even if the function value is not decreased (but there
270    is a maximum allowed increase in terms of the recent history of
271    the iterates).
272
273    References
274    ----------
275    Grippo L, Lampariello F, Lucidi S (1986). A Nonmonotone Line
276    Search Technique for Newton's Method. SIAM Journal on Numerical
277    Analysis, 23, 707-716.
278
279    E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
280    gradient methods: Review and perspectives. Journal of Statistical
281    Software (preprint).
282    """
283
284    alpha = 1.
285    last_obval = obj(x)
286    obj_max = max(obj_hist[-M:])
287
288    for iter in range(maxiter):
289
290        obval = obj(x + alpha*d)
291        g = grad(x)
292        gtd = (g * d).sum()
293
294        if obval <= obj_max + gam*alpha*gtd:
295            return alpha, x + alpha*d, obval, g
296
297        a1 = -0.5*alpha**2*gtd / (obval - last_obval - alpha*gtd)
298
299        if (sig1 <= a1) and (a1 <= sig2*alpha):
300            alpha = a1
301        else:
302            alpha /= 2.
303
304        last_obval = obval
305
306    return None, None, None, None
307
308
309def _spg_optim(func, grad, start, project, maxiter=1e4, M=10,
310               ctol=1e-3, maxiter_nmls=200, lam_min=1e-30,
311               lam_max=1e30, sig1=0.1, sig2=0.9, gam=1e-4):
312    """
313    Implements the spectral projected gradient method for minimizing a
314    differentiable function on a convex domain.
315
316    Parameters
317    ----------
318    func : real valued function
319        The objective function to be minimized.
320    grad : real array-valued function
321        The gradient of the objective function
322    start : array_like
323        The starting point
324    project : function
325        In-place projection of the argument to the domain
326        of func.
327    ... See notes regarding additional arguments
328
329    Returns
330    -------
331    rslt : Bunch
332        rslt.params is the final iterate, other fields describe
333        convergence status.
334
335    Notes
336    -----
337    This can be an effective heuristic algorithm for problems where no
338    guaranteed algorithm for computing a global minimizer is known.
339
340    There are a number of tuning parameters, but these generally
341    should not be changed except for `maxiter` (positive integer) and
342    `ctol` (small positive real).  See the Birgin et al reference for
343    more information about the tuning parameters.
344
345    Reference
346    ---------
347    E. Birgin, J.M. Martinez, and M. Raydan. Spectral projected
348    gradient methods: Review and perspectives. Journal of Statistical
349    Software (preprint).  Available at:
350    http://www.ime.usp.br/~egbirgin/publications/bmr5.pdf
351    """
352
353    lam = min(10*lam_min, lam_max)
354
355    params = start.copy()
356    gval = grad(params)
357
358    obj_hist = [func(params), ]
359
360    for itr in range(int(maxiter)):
361
362        # Check convergence
363        df = params - gval
364        project(df)
365        df -= params
366        if np.max(np.abs(df)) < ctol:
367            return Bunch(**{"Converged": True, "params": params,
368                            "objective_values": obj_hist,
369                            "Message": "Converged successfully"})
370
371        # The line search direction
372        d = params - lam*gval
373        project(d)
374        d -= params
375
376        # Carry out the nonmonotone line search
377        alpha, params1, fval, gval1 = _nmono_linesearch(
378            func,
379            grad,
380            params,
381            d,
382            obj_hist,
383            M=M,
384            sig1=sig1,
385            sig2=sig2,
386            gam=gam,
387            maxiter=maxiter_nmls)
388
389        if alpha is None:
390            return Bunch(**{"Converged": False, "params": params,
391                            "objective_values": obj_hist,
392                            "Message": "Failed in nmono_linesearch"})
393
394        obj_hist.append(fval)
395        s = params1 - params
396        y = gval1 - gval
397
398        sy = (s*y).sum()
399        if sy <= 0:
400            lam = lam_max
401        else:
402            ss = (s*s).sum()
403            lam = max(lam_min, min(ss/sy, lam_max))
404
405        params = params1
406        gval = gval1
407
408    return Bunch(**{"Converged": False, "params": params,
409                    "objective_values": obj_hist,
410                    "Message": "spg_optim did not converge"})
411
412
413def _project_correlation_factors(X):
414    """
415    Project a matrix into the domain of matrices whose row-wise sums
416    of squares are less than or equal to 1.
417
418    The input matrix is modified in-place.
419    """
420    nm = np.sqrt((X*X).sum(1))
421    ii = np.flatnonzero(nm > 1)
422    if len(ii) > 0:
423        X[ii, :] /= nm[ii][:, None]
424
425
426class FactoredPSDMatrix:
427    """
428    Representation of a positive semidefinite matrix in factored form.
429
430    The representation is constructed based on a vector `diag` and
431    rectangular matrix `root`, such that the PSD matrix represented by
432    the class instance is Diag + root * root', where Diag is the
433    square diagonal matrix with `diag` on its main diagonal.
434
435    Parameters
436    ----------
437    diag : 1d array_like
438        See above
439    root : 2d array_like
440        See above
441
442    Notes
443    -----
444    The matrix is represented internally in the form Diag^{1/2}(I +
445    factor * scales * factor')Diag^{1/2}, where `Diag` and `scales`
446    are diagonal matrices, and `factor` is an orthogonal matrix.
447    """
448
449    def __init__(self, diag, root):
450        self.diag = diag
451        self.root = root
452        root = root / np.sqrt(diag)[:, None]
453        u, s, vt = np.linalg.svd(root, 0)
454        self.factor = u
455        self.scales = s**2
456
457    def to_matrix(self):
458        """
459        Returns the PSD matrix represented by this instance as a full
460        (square) matrix.
461        """
462        return np.diag(self.diag) + np.dot(self.root, self.root.T)
463
464    def decorrelate(self, rhs):
465        """
466        Decorrelate the columns of `rhs`.
467
468        Parameters
469        ----------
470        rhs : array_like
471            A 2 dimensional array with the same number of rows as the
472            PSD matrix represented by the class instance.
473
474        Returns
475        -------
476        C^{-1/2} * rhs, where C is the covariance matrix represented
477        by this class instance.
478
479        Notes
480        -----
481        The returned matrix has the identity matrix as its row-wise
482        population covariance matrix.
483
484        This function exploits the factor structure for efficiency.
485        """
486
487        # I + factor * qval * factor' is the inverse square root of
488        # the covariance matrix in the homogeneous case where diag =
489        # 1.
490        qval = -1 + 1 / np.sqrt(1 + self.scales)
491
492        # Decorrelate in the general case.
493        rhs = rhs / np.sqrt(self.diag)[:, None]
494        rhs1 = np.dot(self.factor.T, rhs)
495        rhs1 *= qval[:, None]
496        rhs1 = np.dot(self.factor, rhs1)
497        rhs += rhs1
498
499        return rhs
500
501    def solve(self, rhs):
502        """
503        Solve a linear system of equations with factor-structured
504        coefficients.
505
506        Parameters
507        ----------
508        rhs : array_like
509            A 2 dimensional array with the same number of rows as the
510            PSD matrix represented by the class instance.
511
512        Returns
513        -------
514        C^{-1} * rhs, where C is the covariance matrix represented
515        by this class instance.
516
517        Notes
518        -----
519        This function exploits the factor structure for efficiency.
520        """
521
522        qval = -self.scales / (1 + self.scales)
523        dr = np.sqrt(self.diag)
524        rhs = rhs / dr[:, None]
525        mat = qval[:, None] * np.dot(self.factor.T, rhs)
526        rhs = rhs + np.dot(self.factor, mat)
527        return rhs / dr[:, None]
528
529    def logdet(self):
530        """
531        Returns the logarithm of the determinant of a
532        factor-structured matrix.
533        """
534
535        logdet = np.sum(np.log(self.diag))
536        logdet += np.sum(np.log(self.scales))
537        logdet += np.sum(np.log(1 + 1 / self.scales))
538
539        return logdet
540
541
542def corr_nearest_factor(corr, rank, ctol=1e-6, lam_min=1e-30,
543                        lam_max=1e30, maxiter=1000):
544    """
545    Find the nearest correlation matrix with factor structure to a
546    given square matrix.
547
548    Parameters
549    ----------
550    corr : square array
551        The target matrix (to which the nearest correlation matrix is
552        sought).  Must be square, but need not be positive
553        semidefinite.
554    rank : int
555        The rank of the factor structure of the solution, i.e., the
556        number of linearly independent columns of X.
557    ctol : positive real
558        Convergence criterion.
559    lam_min : float
560        Tuning parameter for spectral projected gradient optimization
561        (smallest allowed step in the search direction).
562    lam_max : float
563        Tuning parameter for spectral projected gradient optimization
564        (largest allowed step in the search direction).
565    maxiter : int
566        Maximum number of iterations in spectral projected gradient
567        optimization.
568
569    Returns
570    -------
571    rslt : Bunch
572        rslt.corr is a FactoredPSDMatrix defining the estimated
573        correlation structure.  Other fields of `rslt` contain
574        returned values from spg_optim.
575
576    Notes
577    -----
578    A correlation matrix has factor structure if it can be written in
579    the form I + XX' - diag(XX'), where X is n x k with linearly
580    independent columns, and with each row having sum of squares at
581    most equal to 1.  The approximation is made in terms of the
582    Frobenius norm.
583
584    This routine is useful when one has an approximate correlation
585    matrix that is not positive semidefinite, and there is need to
586    estimate the inverse, square root, or inverse square root of the
587    population correlation matrix.  The factor structure allows these
588    tasks to be done without constructing any n x n matrices.
589
590    This is a non-convex problem with no known guaranteed globally
591    convergent algorithm for computing the solution.  Borsdof, Higham
592    and Raydan (2010) compared several methods for this problem and
593    found the spectral projected gradient (SPG) method (used here) to
594    perform best.
595
596    The input matrix `corr` can be a dense numpy array or any scipy
597    sparse matrix.  The latter is useful if the input matrix is
598    obtained by thresholding a very large sample correlation matrix.
599    If `corr` is sparse, the calculations are optimized to save
600    memory, so no working matrix with more than 10^6 elements is
601    constructed.
602
603    References
604    ----------
605    .. [*] R Borsdof, N Higham, M Raydan (2010).  Computing a nearest
606       correlation matrix with factor structure. SIAM J Matrix Anal Appl,
607       31:5, 2603-2622.
608       http://eprints.ma.man.ac.uk/1523/01/covered/MIMS_ep2009_87.pdf
609
610    Examples
611    --------
612    Hard thresholding a correlation matrix may result in a matrix that
613    is not positive semidefinite.  We can approximate a hard
614    thresholded correlation matrix with a PSD matrix as follows, where
615    `corr` is the input correlation matrix.
616
617    >>> import numpy as np
618    >>> from statsmodels.stats.correlation_tools import corr_nearest_factor
619    >>> np.random.seed(1234)
620    >>> b = 1.5 - np.random.rand(10, 1)
621    >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
622    >>> corr = np.corrcoef(x.T)
623    >>> corr = corr * (np.abs(corr) >= 0.3)
624    >>> rslt = corr_nearest_factor(corr, 3)
625    """
626
627    p, _ = corr.shape
628
629    # Starting values (following the PCA method in BHR).
630    u, s, vt = svds(corr, rank)
631    X = u * np.sqrt(s)
632    nm = np.sqrt((X**2).sum(1))
633    ii = np.flatnonzero(nm > 1e-5)
634    X[ii, :] /= nm[ii][:, None]
635
636    # Zero the diagonal
637    corr1 = corr.copy()
638    if type(corr1) == np.ndarray:
639        np.fill_diagonal(corr1, 0)
640    elif sparse.issparse(corr1):
641        corr1.setdiag(np.zeros(corr1.shape[0]))
642        corr1.eliminate_zeros()
643        corr1.sort_indices()
644    else:
645        raise ValueError("Matrix type not supported")
646
647    # The gradient, from lemma 4.1 of BHR.
648    def grad(X):
649        gr = np.dot(X, np.dot(X.T, X))
650        if type(corr1) == np.ndarray:
651            gr -= np.dot(corr1, X)
652        else:
653            gr -= corr1.dot(X)
654        gr -= (X*X).sum(1)[:, None] * X
655        return 4*gr
656
657    # The objective function (sum of squared deviations between fitted
658    # and observed arrays).
659    def func(X):
660        if type(corr1) == np.ndarray:
661            M = np.dot(X, X.T)
662            np.fill_diagonal(M, 0)
663            M -= corr1
664            fval = (M*M).sum()
665            return fval
666        else:
667            fval = 0.
668            # Control the size of intermediates
669            max_ws = 1e6
670            bs = int(max_ws / X.shape[0])
671            ir = 0
672            while ir < X.shape[0]:
673                ir2 = min(ir+bs, X.shape[0])
674                u = np.dot(X[ir:ir2, :], X.T)
675                ii = np.arange(u.shape[0])
676                u[ii, ir+ii] = 0
677                u -= np.asarray(corr1[ir:ir2, :].todense())
678                fval += (u*u).sum()
679                ir += bs
680            return fval
681
682    rslt = _spg_optim(func, grad, X, _project_correlation_factors, ctol=ctol,
683                      lam_min=lam_min, lam_max=lam_max, maxiter=maxiter)
684    root = rslt.params
685    diag = 1 - (root**2).sum(1)
686    soln = FactoredPSDMatrix(diag, root)
687    rslt.corr = soln
688    del rslt.params
689    return rslt
690
691
692def cov_nearest_factor_homog(cov, rank):
693    """
694    Approximate an arbitrary square matrix with a factor-structured
695    matrix of the form k*I + XX'.
696
697    Parameters
698    ----------
699    cov : array_like
700        The input array, must be square but need not be positive
701        semidefinite
702    rank : int
703        The rank of the fitted factor structure
704
705    Returns
706    -------
707    A FactoredPSDMatrix instance containing the fitted matrix
708
709    Notes
710    -----
711    This routine is useful if one has an estimated covariance matrix
712    that is not SPD, and the ultimate goal is to estimate the inverse,
713    square root, or inverse square root of the true covariance
714    matrix. The factor structure allows these tasks to be performed
715    without constructing any n x n matrices.
716
717    The calculations use the fact that if k is known, then X can be
718    determined from the eigen-decomposition of cov - k*I, which can
719    in turn be easily obtained form the eigen-decomposition of `cov`.
720    Thus the problem can be reduced to a 1-dimensional search for k
721    that does not require repeated eigen-decompositions.
722
723    If the input matrix is sparse, then cov - k*I is also sparse, so
724    the eigen-decomposition can be done efficiently using sparse
725    routines.
726
727    The one-dimensional search for the optimal value of k is not
728    convex, so a local minimum could be obtained.
729
730    Examples
731    --------
732    Hard thresholding a covariance matrix may result in a matrix that
733    is not positive semidefinite.  We can approximate a hard
734    thresholded covariance matrix with a PSD matrix as follows:
735
736    >>> import numpy as np
737    >>> np.random.seed(1234)
738    >>> b = 1.5 - np.random.rand(10, 1)
739    >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
740    >>> cov = np.cov(x)
741    >>> cov = cov * (np.abs(cov) >= 0.3)
742    >>> rslt = cov_nearest_factor_homog(cov, 3)
743    """
744
745    m, n = cov.shape
746
747    Q, Lambda, _ = svds(cov, rank)
748
749    if sparse.issparse(cov):
750        QSQ = np.dot(Q.T, cov.dot(Q))
751        ts = cov.diagonal().sum()
752        tss = cov.dot(cov).diagonal().sum()
753    else:
754        QSQ = np.dot(Q.T, np.dot(cov, Q))
755        ts = np.trace(cov)
756        tss = np.trace(np.dot(cov, cov))
757
758    def fun(k):
759        Lambda_t = Lambda - k
760        v = tss + m*(k**2) + np.sum(Lambda_t**2) - 2*k*ts
761        v += 2*k*np.sum(Lambda_t) - 2*np.sum(np.diag(QSQ) * Lambda_t)
762        return v
763
764    # Get the optimal decomposition
765    k_opt = fminbound(fun, 0, 1e5)
766    Lambda_opt = Lambda - k_opt
767    fac_opt = Q * np.sqrt(Lambda_opt)
768
769    diag = k_opt * np.ones(m, dtype=np.float64)  # - (fac_opt**2).sum(1)
770    return FactoredPSDMatrix(diag, fac_opt)
771
772
773def corr_thresholded(data, minabs=None, max_elt=1e7):
774    r"""
775    Construct a sparse matrix containing the thresholded row-wise
776    correlation matrix from a data array.
777
778    Parameters
779    ----------
780    data : array_like
781        The data from which the row-wise thresholded correlation
782        matrix is to be computed.
783    minabs : non-negative real
784        The threshold value; correlation coefficients smaller in
785        magnitude than minabs are set to zero.  If None, defaults
786        to 1 / sqrt(n), see Notes for more information.
787
788    Returns
789    -------
790    cormat : sparse.coo_matrix
791        The thresholded correlation matrix, in COO format.
792
793    Notes
794    -----
795    This is an alternative to C = np.corrcoef(data); C \*= (np.abs(C)
796    >= absmin), suitable for very tall data matrices.
797
798    If the data are jointly Gaussian, the marginal sampling
799    distributions of the elements of the sample correlation matrix are
800    approximately Gaussian with standard deviation 1 / sqrt(n).  The
801    default value of ``minabs`` is thus equal to 1 standard error, which
802    will set to zero approximately 68% of the estimated correlation
803    coefficients for which the population value is zero.
804
805    No intermediate matrix with more than ``max_elt`` values will be
806    constructed.  However memory use could still be high if a large
807    number of correlation values exceed `minabs` in magnitude.
808
809    The thresholded matrix is returned in COO format, which can easily
810    be converted to other sparse formats.
811
812    Examples
813    --------
814    Here X is a tall data matrix (e.g. with 100,000 rows and 50
815    columns).  The row-wise correlation matrix of X is calculated
816    and stored in sparse form, with all entries smaller than 0.3
817    treated as 0.
818
819    >>> import numpy as np
820    >>> np.random.seed(1234)
821    >>> b = 1.5 - np.random.rand(10, 1)
822    >>> x = np.random.randn(100,1).dot(b.T) + np.random.randn(100,10)
823    >>> cmat = corr_thresholded(x, 0.3)
824    """
825
826    nrow, ncol = data.shape
827
828    if minabs is None:
829        minabs = 1. / float(ncol)
830
831    # Row-standardize the data
832    data = data.copy()
833    data -= data.mean(1)[:, None]
834    sd = data.std(1, ddof=1)
835    ii = np.flatnonzero(sd > 1e-5)
836    data[ii, :] /= sd[ii][:, None]
837    ii = np.flatnonzero(sd <= 1e-5)
838    data[ii, :] = 0
839
840    # Number of rows to process in one pass
841    bs = int(np.floor(max_elt / nrow))
842
843    ipos_all, jpos_all, cor_values = [], [], []
844
845    ir = 0
846    while ir < nrow:
847        ir2 = min(data.shape[0], ir + bs)
848        cm = np.dot(data[ir:ir2, :], data.T) / (ncol - 1)
849        cma = np.abs(cm)
850        ipos, jpos = np.nonzero(cma >= minabs)
851        ipos_all.append(ipos + ir)
852        jpos_all.append(jpos)
853        cor_values.append(cm[ipos, jpos])
854        ir += bs
855
856    ipos = np.concatenate(ipos_all)
857    jpos = np.concatenate(jpos_all)
858    cor_values = np.concatenate(cor_values)
859
860    cmat = sparse.coo_matrix((cor_values, (ipos, jpos)), (nrow, nrow))
861
862    return cmat
863
864
865class MultivariateKernel(object):
866    """
867    Base class for multivariate kernels.
868
869    An instance of MultivariateKernel implements a `call` method having
870    signature `call(x, loc)`, returning the kernel weights comparing `x`
871    (a 1d ndarray) to each row of `loc` (a 2d ndarray).
872    """
873
874    def call(self, x, loc):
875        raise NotImplementedError
876
877    def set_bandwidth(self, bw):
878        """
879        Set the bandwidth to the given vector.
880
881        Parameters
882        ----------
883        bw : array_like
884            A vector of non-negative bandwidth values.
885        """
886
887        self.bw = bw
888        self._setup()
889
890    def _setup(self):
891
892        # Precompute the squared bandwidth values.
893        self.bwk = np.prod(self.bw)
894        self.bw2 = self.bw * self.bw
895
896    def set_default_bw(self, loc, bwm=None):
897        """
898        Set default bandwiths based on domain values.
899
900        Parameters
901        ----------
902        loc : array_like
903            Values from the domain to which the kernel will
904            be applied.
905        bwm : scalar, optional
906            A non-negative scalar that is used to multiply
907            the default bandwidth.
908        """
909
910        sd = loc.std(0)
911        q25, q75 = np.percentile(loc, [25, 75], axis=0)
912        iqr = (q75 - q25) / 1.349
913        bw = np.where(iqr < sd, iqr, sd)
914        bw *= 0.9 / loc.shape[0] ** 0.2
915
916        if bwm is not None:
917            bw *= bwm
918
919        # The final bandwidths
920        self.bw = np.asarray(bw, dtype=np.float64)
921
922        self._setup()
923
924
925class GaussianMultivariateKernel(MultivariateKernel):
926    """
927    The Gaussian (squared exponential) multivariate kernel.
928    """
929
930    def call(self, x, loc):
931        return np.exp(-(x - loc)**2 / (2 * self.bw2)).sum(1) / self.bwk
932
933
934def kernel_covariance(exog, loc, groups, kernel=None, bw=None):
935    """
936    Use kernel averaging to estimate a multivariate covariance function.
937
938    The goal is to estimate a covariance function C(x, y) =
939    cov(Z(x), Z(y)) where x, y are vectors in R^p (e.g. representing
940    locations in time or space), and Z(.) represents a multivariate
941    process on R^p.
942
943    The data used for estimation can be observed at arbitrary values of the
944    position vector, and there can be multiple independent observations
945    from the process.
946
947    Parameters
948    ----------
949    exog : array_like
950        The rows of exog are realizations of the process obtained at
951        specified points.
952    loc : array_like
953        The rows of loc are the locations (e.g. in space or time) at
954        which the rows of exog are observed.
955    groups : array_like
956        The values of groups are labels for distinct independent copies
957        of the process.
958    kernel : MultivariateKernel instance, optional
959        An instance of MultivariateKernel, defaults to
960        GaussianMultivariateKernel.
961    bw : array_like or scalar
962        A bandwidth vector, or bandwidth multiplier.  If a 1d array, it
963        contains kernel bandwidths for each component of the process, and
964        must have length equal to the number of columns of exog.  If a scalar,
965        bw is a bandwidth multiplier used to adjust the default bandwidth; if
966        None, a default bandwidth is used.
967
968    Returns
969    -------
970    A real-valued function C(x, y) that returns an estimate of the covariance
971    between values of the process located at x and y.
972
973    References
974    ----------
975    .. [1] Genton M, W Kleiber (2015).  Cross covariance functions for
976        multivariate geostatics.  Statistical Science 30(2).
977        https://arxiv.org/pdf/1507.08017.pdf
978    """
979
980    exog = np.asarray(exog)
981    loc = np.asarray(loc)
982    groups = np.asarray(groups)
983
984    if loc.ndim == 1:
985        loc = loc[:, None]
986
987    v = [exog.shape[0], loc.shape[0], len(groups)]
988    if min(v) != max(v):
989        msg = "exog, loc, and groups must have the same number of rows"
990        raise ValueError(msg)
991
992    # Map from group labels to the row indices in each group.
993    ix = {}
994    for i, g in enumerate(groups):
995        if g not in ix:
996            ix[g] = []
997        ix[g].append(i)
998    for g in ix.keys():
999        ix[g] = np.sort(ix[g])
1000
1001    if kernel is None:
1002        kernel = GaussianMultivariateKernel()
1003
1004    if bw is None:
1005        kernel.set_default_bw(loc)
1006    elif np.isscalar(bw):
1007        kernel.set_default_bw(loc, bwm=bw)
1008    else:
1009        kernel.set_bandwidth(bw)
1010
1011    def cov(x, y):
1012
1013        kx = kernel.call(x, loc)
1014        ky = kernel.call(y, loc)
1015
1016        cm, cw = 0., 0.
1017
1018        for g, ii in ix.items():
1019
1020            m = len(ii)
1021            j1, j2 = np.indices((m, m))
1022            j1 = ii[j1.flat]
1023            j2 = ii[j2.flat]
1024            w = kx[j1] * ky[j2]
1025
1026            # TODO: some other form of broadcasting may be faster than
1027            # einsum here
1028            cm += np.einsum("ij,ik,i->jk", exog[j1, :], exog[j2, :], w)
1029            cw += w.sum()
1030
1031        if cw < 1e-10:
1032            msg = ("Effective sample size is 0.  The bandwidth may be too " +
1033                   "small, or you are outside the range of your data.")
1034            warnings.warn(msg)
1035            return np.nan * np.ones_like(cm)
1036
1037        return cm / cw
1038
1039    return cov
1040