tsa/statespace/_tools.pyx.in

#cython: boundscheck=False
#cython: wraparound=False
#cython: cdivision=False
"""
State Space Model - Cython tools

Author: Chad Fulton
License: Simplified-BSD
"""

# Typical imports
cimport numpy as np
cimport cython
import numpy as np

np.import_array()

from statsmodels.src.math cimport *
cimport scipy.linalg.cython_blas as blas
cimport scipy.linalg.cython_lapack as lapack

cdef FORTRAN = 1

# Fused scalar type covering the four precisions dispatched on by the BLAS
# wrappers `copy` and `swap` below (s, d, c and z routines respectively for
# float32, float64, complex64 and complex128).
ctypedef fused sm_type_t:
    np.float64_t
    np.float32_t
    np.complex64_t
    np.complex128_t


# ------------------------------------------------------------------------
# Array shape validation

cdef validate_matrix_shape(str name, Py_ssize_t *shape, int nrows, int ncols, nobs=None):
    """
    Check the dimensions of a (possibly time-varying) matrix.

    Parameters
    ----------
    name : str
        Name of the matrix, used only in the error messages.
    shape : Py_ssize_t*
        Pointer to the shape array. Elements 0 and 1 are always read;
        element 2 is read only when `nobs` is given.
    nrows : int
        Required number of rows (`shape[0]`).
    ncols : int
        Required number of columns (`shape[1]`).
    nobs : int, optional
        If given, `shape[2]` must be either 1 or `nobs`.

    Raises
    ------
    ValueError
        If any of the checked dimensions does not match.
    """
    if not shape[0] == nrows:
        raise ValueError('Invalid shape for %s matrix: requires %d rows,'
                         ' got %d' % (name, nrows, shape[0]))
    if not shape[1] == ncols:
        # Report the *required* column count followed by the actual one
        raise ValueError('Invalid shape for %s matrix: requires %d columns,'
                         ' got %d' % (name, ncols, shape[1]))
    if nobs is not None and shape[2] not in [1, nobs]:
        raise ValueError('Invalid time-varying dimension for %s matrix:'
                         ' requires 1 or %d, got %d' % (name, nobs, shape[2]))


cdef validate_vector_shape(str name, Py_ssize_t *shape, int nrows, nobs = None):
    """
    Check the dimensions of a (possibly time-varying) vector.

    Parameters
    ----------
    name : str
        Name of the vector, used only in the error messages.
    shape : Py_ssize_t*
        Pointer to the shape array. Element 0 is always read; element 1 is
        read only when `nobs` is given.
    nrows : int
        Required number of rows (`shape[0]`).
    nobs : int, optional
        If given, `shape[1]` must be either 1 or `nobs`.

    Raises
    ------
    ValueError
        If any of the checked dimensions does not match.
    """
    if shape[0] != nrows:
        raise ValueError('Invalid shape for %s vector: requires %d rows,'
                         ' got %d' % (name, nrows, shape[0]))

    # Without `nobs` there is no time-varying dimension to check
    if nobs is None:
        return

    if shape[1] not in [1, nobs]:
        raise ValueError('Invalid time-varying dimension for %s vector:'
                         ' requires 1 or %d got %d' % (name, nobs, shape[1]))


# ------------------------------------------------------------------------
# Blas Wrapping

cdef inline copy(int size, sm_type_t* src_arr, sm_type_t* target,
                 int incx=1, int incy=1):
    """
    Copy `size` elements from one vector to another via BLAS ?copy.

    The routine used (scopy, dcopy, ccopy or zcopy) is selected from the
    fused type of the pointers.

    Parameters
    ----------
    size : int
        Number of elements to copy
    src_arr : sm_type_t*
        Pointer to the first element of the source vector
    target : sm_type_t*
        Pointer to the first element of the destination vector
    incx : int, default 1
        Stride between consecutive elements of `src_arr`
    incy : int, default 1
        Stride between consecutive elements of `target`

    References
    ----------
    https://software.intel.com/en-us/mkl-developer-reference-c-cblas-copy
    """
    # Real precisions
    if sm_type_t is np.float64_t:
        blas.dcopy(&size, src_arr, &incx, target, &incy)
    elif sm_type_t is np.float32_t:
        blas.scopy(&size, src_arr, &incx, target, &incy)
    # Complex precisions
    elif sm_type_t is np.complex128_t:
        blas.zcopy(&size, src_arr, &incx, target, &incy)
    elif sm_type_t is np.complex64_t:
        blas.ccopy(&size, src_arr, &incx, target, &incy)


cdef inline swap(int size, sm_type_t* src_arr, sm_type_t* target,
                 int incx=1, int incy=1):
    """
    Exchange `size` elements between two vectors via BLAS ?swap.

    The routine used (sswap, dswap, cswap or zswap) is selected from the
    fused type of the pointers.

    Parameters
    ----------
    size : int
        Number of elements to exchange
    src_arr : sm_type_t*
        Pointer to the first element of the first vector
    target : sm_type_t*
        Pointer to the first element of the second vector
    incx : int, default 1
        Stride between consecutive elements of `src_arr`
    incy : int, default 1
        Stride between consecutive elements of `target`

    References
    ----------
    https://software.intel.com/en-us/mkl-developer-reference-c-cblas-swap
    """
    # Real precisions
    if sm_type_t is np.float64_t:
        blas.dswap(&size, src_arr, &incx, target, &incy)
    elif sm_type_t is np.float32_t:
        blas.sswap(&size, src_arr, &incx, target, &incy)
    # Complex precisions
    elif sm_type_t is np.complex128_t:
        blas.zswap(&size, src_arr, &incx, target, &incy)
    elif sm_type_t is np.complex64_t:
        blas.cswap(&size, src_arr, &incx, target, &incy)


# ------------------------------------------------------------------------

{{py:

# Maps each BLAS/LAPACK precision prefix to a tuple of
# (Cython scalar type, dtype expression, NumPy C-API type number).
# The template loop that follows iterates over these entries, so every
# templated definition is emitted once per prefix.
TYPES = {
    "s": ("np.float32_t", "np.float32", "np.NPY_FLOAT32"),
    "d": ("np.float64_t", "float", "np.NPY_FLOAT64"),
    "c": ("np.complex64_t", "np.complex64", "np.NPY_COMPLEX64"),
    "z": ("np.complex128_t", "complex", "np.NPY_COMPLEX128"),
}

}}

{{for prefix, types in TYPES.items()}}
{{py:cython_type, dtype, typenum = types}}
{{py:
# `combined_prefix` / `combined_cython_type` collapse the single-precision
# variants onto their double-precision counterparts (s -> d, c -> z), while
# d and z map to themselves. The templated code uses `combined_prefix` to
# tell the real case ('d') apart from the complex case ('z').
combined_prefix = prefix
combined_cython_type = cython_type
if prefix == 'c':
    combined_prefix = 'z'
    combined_cython_type = 'np.complex128_t'
if prefix == 's':
    combined_prefix = 'd'
    combined_cython_type = 'np.float64_t'
}}


# Dummy one-argument selection callback; it always returns 0. It is passed
# to ?gees in the complex branch of the Lyapunov solver below, where
# eigenvalue ordering is not requested.
cdef bint _{{prefix}}select1({{cython_type}} * a):
    return 0

# Dummy two-argument selection callback; it always returns 0. It is passed
# to ?gees in the real branch of the Lyapunov solver below, where eigenvalue
# ordering is not requested.
cdef bint _{{prefix}}select2({{cython_type}} * a, {{cython_type}} * b):
    return 0

cdef int _{{prefix}}solve_discrete_lyapunov({{cython_type}} * a, {{cython_type}} * q, int n, int complex_step=False) except *:
    """
    Solve a discrete Lyapunov equation of the form $A'XA-X=-Q$ in place.

    The discrete problem is transformed into a continuous-time Lyapunov
    (Sylvester) equation, which is then solved with a Schur decomposition
    (?gees) followed by ?trsyl.

    Parameters
    ----------
    a : pointer
        Pointer to the n x n column-major matrix A. Overwritten.
    q : pointer
        Pointer to the n x n column-major matrix Q. Overwritten with the
        solution.
    n : int
        Dimension of `a` and `q`.
    complex_step : int, default False
        In the complex variants, selects whether plain conjugation steps are
        replaced by the complex-step handling below.

    Raises
    ------
    np.linalg.LinAlgError
        If the LU factorization, an LU solve or the Schur decomposition
        reports a non-zero `info`.
    """
    # Note: some of this code (esp. the Sylvester solving part) cribbed from
    # https://raw.githubusercontent.com/scipy/scipy/master/scipy/linalg/_solvers.py

    # Solve an equation of the form $A'XA-X=-Q$
    # a: input / output
    # q: input / output
    cdef:
        int i, j
        int info
        int inc = 1
        int n2 = n**2
        {{if prefix == 's' or prefix == 'c'}}
        np.float32_t scale = 0.0
        {{else}}
        np.float64_t scale = 0.0
        {{endif}}
        {{cython_type}} tmp = 0.0
        {{cython_type}} alpha = 1.0
        {{cython_type}} beta = 0.0
        {{cython_type}} delta = -2.0
        char trans
    cdef np.npy_intp dim[2]
    cdef {{cython_type}} [::1,:] apI, capI, u, v
    cdef int [::1,:] ipiv
    # Dummy selection function, will not actually be referenced since we do not
    # need to order the eigenvalues in the ?gees call.
    cdef:
        int sdim
        int lwork = 3*n
        bint bwork
    cdef np.npy_intp dim1[1]
    cdef {{cython_type}} [::1,:] work
    cdef {{cython_type}} [:] wr
    {{if prefix == 's' or prefix == 'c'}}
    cdef np.float32_t [:] wi
    {{else}}
    cdef np.float64_t [:] wi
    {{endif}}

    # Initialize arrays
    dim[0] = n; dim[1] = n;
    apI = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)
    capI = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)
    u = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)
    v = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)
    ipiv = np.PyArray_ZEROS(2, dim, np.NPY_INT32, FORTRAN)

    dim1[0] = n;
    wr = np.PyArray_ZEROS(1, dim1, {{typenum}}, FORTRAN)
    # `wi` is a real array in every variant (the imaginary parts for the real
    # ?gees, the real workspace for the complex ?gees), so its allocation must
    # match the real dtype it was declared with above.
    {{if prefix == 's' or prefix == 'c'}}
    wi = np.PyArray_ZEROS(1, dim1, np.NPY_FLOAT32, FORTRAN)
    {{else}}
    wi = np.PyArray_ZEROS(1, dim1, np.NPY_FLOAT64, FORTRAN)
    {{endif}}
    #vs = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)
    dim[0] = lwork; dim[1] = lwork;
    work = np.PyArray_ZEROS(2, dim, {{typenum}}, FORTRAN)

    # - Solve for b.conj().transpose() --------

    # Get apI = a + I (stored in apI)
    # = (a + eye)
    # For: c = 2*np.dot(np.dot(inv(a + eye), q), aHI_inv)
    copy(n2, a, &apI[0, 0])
    # (for loop below adds the identity)

    # Get conj(a) + I (stored in capI)
    # a^H + I -> capI
    # For: aHI_inv = inv(aH + eye)
    copy(n2, a, &capI[0, 0])
    # (for loop below adds the identity)

    # Get conj(a) - I (stored in a)
    # a^H - I -> a
    # For: b = np.dot(aH - eye, aHI_inv)
    # (for loop below subtracts the identity)

    # Add / subtract identity matrix
    for i in range(n):
        apI[i,i] = apI[i,i] + 1 # apI -> a + eye
        capI[i,i] = capI[i,i] + 1 # aH + eye
        a[i + i*n] = a[i + i*n] - 1 # a - eye

    # Solve [conj(a) + I] b' = [conj(a) - I] (result stored in a)
    # For: b = np.dot(aH - eye, aHI_inv)
    # Where: aHI_inv = inv(aH + eye)
    # where b = (a^H - eye) (a^H + eye)^{-1}
    # or b^H = (a + eye)^{-1} (a - eye)
    # or (a + eye) b^H = (a - eye)
    lapack.{{prefix}}getrf(&n, &n, &capI[0,0], &n, &ipiv[0,0], &info)

    if not info == 0:
        raise np.linalg.LinAlgError('LU decomposition error.')

    lapack.{{prefix}}getrs("N", &n, &n, &capI[0,0], &n, &ipiv[0,0],
                               a, &n, &info)

    if not info == 0:
        raise np.linalg.LinAlgError('LU solver error.')

    # Now we have b^H; we could take the conjugate transpose to get b, except
    # that the input to the continuous Lyapunov equation is exactly
    # b^H, so we already have the quantity we need.

    # - Solve for (-c) --------

    # where c = 2*np.dot(np.dot(inv(a + eye), q), aHI_inv)
    # = 2*(a + eye)^{-1} q (a^H + eye)^{-1}
    # and with q Hermitian
    # consider x = (a + eye)^{-1} q (a^H + eye)^{-1}
    # this can be done in two linear solving steps:
    # 1. consider y = q (a^H + eye)^{-1}
    #    or y (a^H + eye) = q
    #    or (a^H + eye)^H y^H = q^H
    #    or (a + eye) y^H = q
    # 2. Then consider x = (a + eye)^{-1} y
    #    or (a + eye) x = y

    # Solve [conj(a) + I] tmp' = q (result stored in q)
    # For: y = q (a^H + eye)^{-1} => (a + eye) y^H = q
    lapack.{{prefix}}getrs("N", &n, &n, &capI[0,0], &n, &ipiv[0,0],
                                        q, &n, &info)

    if not info == 0:
        raise np.linalg.LinAlgError('LU solver error.')

    # Replace the result (stored in q) with its (conjugate) transpose
    for j in range(1, n):
        for i in range(j):
            tmp = q[i + j*n]
            q[i + j*n] = q[j + i*n]
            q[j + i*n] = tmp

    {{if combined_prefix == 'z'}}
    if not complex_step:
        for i in range(n2):
            q[i] = q[i] - q[i].imag * 2.0j
    {{endif}}

    lapack.{{prefix}}getrs("N", &n, &n, &capI[0,0], &n, &ipiv[0,0],
                                        q, &n, &info)

    if not info == 0:
        raise np.linalg.LinAlgError('LU solver error.')

    # q -> -2.0 * q
    blas.{{prefix}}scal(&n2, &delta, q, &inc)

    # - Solve continuous time Lyapunov --------

    # Now solve the continuous time Lyapunov equation (AX + XA^H = Q), on the
    # transformed inputs ...

    # ... which requires solving the continuous time Sylvester equation
    # (AX + XB = Q) where B = A^H

    # Compute the real Schur decomposition of a (unordered)
    # TODO compute the optimal lwork rather than always using 3*n
    {{if combined_prefix == 'd'}}
    # a is now the Schur form of A; (r)
    # u is now the unitary Schur transformation matrix for A; (u)
    # In the usual case, we will also have:
    # r = s, so s is also stored in a
    # u = v, so v is also stored in u
    # In the complex-step case, we will instead have:
    # r = s.conj()
    # u = v.conj()
    lapack.{{prefix}}gees("V", "N", <lapack.{{prefix}}select2 *> &_{{prefix}}select2, &n,
                          a, &n,
                          &sdim,
                          &wr[0], &wi[0],
                          &u[0,0], &n,
                          &work[0,0], &lwork,
                          &bwork, &info)
    {{else}}
    lapack.{{prefix}}gees("V", "N", <lapack.{{prefix}}select1 *> &_{{prefix}}select1, &n,
                          a, &n,
                          &sdim,
                          &wr[0],
                          &u[0,0], &n,
                          &work[0,0], &lwork,
                          &wi[0],
                          &bwork, &info)
    {{endif}}

    if not info == 0:
        raise np.linalg.LinAlgError('Schur decomposition solver error.')

    # Get v (so that in the complex step case we can take the conjugate)
    copy(n2, &u[0, 0], &v[0, 0])
    # If complex step, take the conjugate
    {{if combined_prefix == 'z'}}
    if complex_step:
        for i in range(n):
            for j in range(n):
                v[i,j] = v[i,j] - v[i,j].imag * 2.0j
    {{endif}}

    # Construct f = u^H*q*u (result overwrites q)
    # In the usual case, v = u
    # In the complex step case, v = u.conj()
    blas.{{prefix}}gemm("N", "N", &n, &n, &n,
                        &alpha, q, &n,
                                &v[0,0], &n,
                        &beta, &capI[0,0], &n)
    blas.{{prefix}}gemm("C", "N", &n, &n, &n,
                        &alpha, &u[0,0], &n,
                                &capI[0,0], &n,
                        &beta, q, &n)

    # ?TRSYL Solve op(A)*X + X*op(B) = scale*C which is here:
    # r*X + X*r = scale*q
    # results overwrite q
    copy(n2, a, &apI[0, 0])
    {{if combined_prefix == 'z'}}
    if complex_step:
        for i in range(n):
            for j in range(n):
                apI[j,i] = apI[j,i] - apI[j,i].imag * 2.0j
    {{endif}}
    lapack.{{prefix}}trsyl("N", "C", &inc, &n, &n,
                           a, &n,
                           &apI[0,0], &n,
                           q, &n,
                           &scale, &info)

    # Scale q by scale
    if not scale == 1.0:
        # `scale` is a real scalar in every variant, whereas ?scal expects a
        # pointer to a scalar of the matrix type. Convert by value into `tmp`
        # rather than reinterpreting the pointer, which for the complex
        # variants would read past the end of `scale`.
        tmp = scale
        blas.{{prefix}}scal(&n2, &tmp, q, &inc)

    # Calculate the solution: u * q * v^H (results overwrite q)
    # In the usual case, v = u
    # In the complex step case, v = u.conj()
    blas.{{prefix}}gemm("N", "C", &n, &n, &n,
                        &alpha, q, &n,
                                &v[0,0], &n,
                        &beta, &capI[0,0], &n)
    blas.{{prefix}}gemm("N", "N", &n, &n, &n,
                        &alpha, &u[0,0], &n,
                                &capI[0,0], &n,
                        &beta, q, &n)


cpdef _{{prefix}}compute_coefficients_from_multivariate_pacf({{cython_type}} [::1,:] partial_autocorrelations,
                                                             {{cython_type}} [::1,:] error_variance,
                                                             int transform_variance,
                                                             int order, int k_endog):
    """
    Compute coefficient matrices from multivariate partial autocorrelation
    matrices.

    Parameters
    ----------
    partial_autocorrelations : Fortran-ordered 2-dim memoryview
        Partial autocorrelation matrices, read in blocks of `k_endog`
        columns; block `s` starts at column `s * k_endog`.
    error_variance : Fortran-ordered 2-dim memoryview
        Variance used to start the recursion (`k_endog**2` elements are
        read). If `transform_variance` is false, a copy is kept as the
        target variance and a scaled identity matrix is used for the
        recursion instead.
    transform_variance : int
        If false, the computed coefficient matrices are adjusted after the
        main loop using the copy of the passed variance.
    order : int
        Number of partial autocorrelation matrices / recursion steps.
    k_endog : int
        Dimension of each square block.

    Returns
    -------
    forwards : memoryview
        The forward coefficient buffer written by the last recursion step
        (`forwards2` if `order` is even, otherwise `forwards1`).
    forward_variance : memoryview
        The forward variance after the last recursion step.

    Notes
    -----

    This uses the ?trmm BLAS functions which are not available in
    Scipy v0.11.0

    The `info` codes set by the LAPACK calls in this function are not
    inspected.
    """
    # Constants
    cdef:
        {{cython_type}} alpha = 1.0
        {{cython_type}} beta = 0.0
        {{cython_type}} gamma = -1.0
        int k_endog2 = k_endog**2
        int k_endog_order = k_endog * order
        int k_endog_order1 = k_endog * (order+1)
        int info, s, k
    # Local variables
    cdef:
        np.npy_intp dim2[2]
        {{cython_type}} [::1, :] initial_variance
        {{cython_type}} [::1, :] forward_variance
        {{cython_type}} [::1, :] backward_variance
        {{cython_type}} [::1, :] autocovariances
        {{cython_type}} [::1, :] forwards1
        {{cython_type}} [::1, :] forwards2
        {{cython_type}} [::1, :] backwards1
        {{cython_type}} [::1, :] backwards2
        {{cython_type}} [::1, :] forward_factors
        {{cython_type}} [::1, :] backward_factors
        {{cython_type}} [::1, :] tmp
        {{cython_type}} [::1, :] tmp2
    # Pointers
    cdef:
        {{cython_type}} * forwards
        {{cython_type}} * prev_forwards
        {{cython_type}} * backwards
        {{cython_type}} * prev_backwards
    # ?trmm
    # cdef {{prefix}}trmm_t *{{prefix}}trmm = <{{prefix}}trmm_t*>Capsule_AsVoidPtr(blas.{{prefix}}trmm._cpointer)

    # dim2[0] = self.k_endog; dim2[1] = storage;
    # self.forecast = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)

    # If we want to keep the provided variance but with the constrained
    # coefficient matrices, we need to make a copy here, and then after the
    # main loop we will transform the coefficients to match the passed variance
    if not transform_variance:
        initial_variance = np.asfortranarray(error_variance.copy())
        # Need to make the input variance large enough that the recursions
        # do not lead to zero-matrices due to roundoff error, which would cause
        # exceptions from the Cholesky decompositions.
        # Note that this will still not always ensure positive definiteness,
        # and for k_endog, order large enough an exception may still be raised
        error_variance = np.asfortranarray(np.eye(k_endog, dtype={{dtype}}) * (order + k_endog)**10)

    # Initialize matrices
    dim2[0] = k_endog; dim2[1] = k_endog;
    forward_variance = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    backward_variance = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    forward_factors = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    backward_factors = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    tmp = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    tmp2 = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)

    dim2[0] = k_endog; dim2[1] = k_endog_order;
    # \phi_{s,k}, s = 1, ..., p
    #             k = 1, ..., s+1
    forwards1 = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    forwards2 = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    # \phi_{s,k}^*
    backwards1 = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    backwards2 = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)

    dim2[0] = k_endog; dim2[1] = k_endog_order1;
    autocovariances = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)

    copy(k_endog2, &error_variance[0, 0], &forward_variance[0, 0])   # \Sigma_s
    copy(k_endog2, &error_variance[0, 0], &backward_variance[0, 0])  # \Sigma_s^*,  s = 0, ..., p
    copy(k_endog2, &error_variance[0, 0], &autocovariances[0, 0])  # \Gamma_s

    # error_variance_factor = linalg.cholesky(error_variance, lower=True)
    copy(k_endog2, &error_variance[0, 0], &forward_factors[0,0])
    lapack.{{prefix}}potrf("L", &k_endog, &forward_factors[0,0], &k_endog, &info)
    copy(k_endog2, &forward_factors[0, 0], &backward_factors[0, 0])

    # We fill in the entries as follows:
    # [1,1]
    # [2,2], [2,1]
    # [3,3], [3,1], [3,2]
    # ...
    # [p,p], [p,1], ..., [p,p-1]
    # the last row, correctly ordered, is then used as the coefficients
    for s in range(order):  # s = 0, ..., p-1
        # The two pairs of buffers alternate roles each iteration: one holds
        # the values being computed, the other the previous iteration's values
        if s % 2 == 0:
            forwards = &forwards1[0, 0]
            prev_forwards = &forwards2[0, 0]
            backwards = &backwards1[0, 0]
            prev_backwards = &backwards2[0, 0]
        else:
            forwards = &forwards2[0, 0]
            prev_forwards = &forwards1[0, 0]
            backwards = &backwards2[0, 0]
            prev_backwards = &backwards1[0, 0]

        # Create the "last" (k = s+1) matrix
        # Note: this is for k = s+1. However, below we then have to fill
        # in for k = 1, ..., s in order.
        # P L*^{-1} = x
        # x L* = P
        # L*' x' = P'
        # forwards[:, s*k_endog:(s+1)*k_endog] = np.dot(
        #     forward_factors,
        #     linalg.solve_triangular(
        #         backward_factors, partial_autocorrelations[:, s*k_endog:(s+1)*k_endog].T,
        #         lower=True, trans='T').T
        # )
        # Copy row k of the s-th block into column k of tmp (i.e. transpose)
        for k in range(k_endog):
            copy(k_endog, &partial_autocorrelations[k, s*k_endog], &tmp[0, k], k_endog)
        lapack.{{prefix}}trtrs("L", "T", "N", &k_endog, &k_endog, &backward_factors[0,0], &k_endog,
                                                           &tmp[0, 0], &k_endog, &info)
        # {{prefix}}gemm("N", "T", &k_endog, &k_endog, &k_endog,
        #   &alpha, &forward_factors[0,0], &k_endog,
        #           &tmp[0, 0], &k_endog,
        #   &beta, &forwards[s*k_endog2], &k_endog)
        blas.{{prefix}}trmm("R", "L", "T", "N", &k_endog, &k_endog,
          &alpha, &forward_factors[0,0], &k_endog,
                  &tmp[0, 0], &k_endog)
        # Copy row k of tmp into column k of the s-th forwards block
        for k in range(k_endog):
            copy(k_endog, &tmp[k, 0], &forwards[s*k_endog2 + k*k_endog], k_endog)

        # P' L^{-1} = x
        # x L = P'
        # L' x' = P
        # backwards[:, s*k_endog:(s+1)*k_endog] = np.dot(
        #     backward_factors,
        #     linalg.solve_triangular(
        #         forward_factors, partial_autocorrelations[:, s*k_endog:(s+1)*k_endog],
        #         lower=True, trans='T').T
        # )
        copy(k_endog2, &partial_autocorrelations[0, s*k_endog], &tmp[0, 0])
        lapack.{{prefix}}trtrs("L", "T", "N", &k_endog, &k_endog, &forward_factors[0, 0], &k_endog,
                                                           &tmp[0, 0], &k_endog, &info)
        # {{prefix}}gemm("N", "T", &k_endog, &k_endog, &k_endog,
        #   &alpha, &backward_factors[0, 0], &k_endog,
        #           &tmp[0, 0], &k_endog,
        #   &beta, &backwards[s * k_endog2], &k_endog)
        blas.{{prefix}}trmm("R", "L", "T", "N", &k_endog, &k_endog,
          &alpha, &backward_factors[0,0], &k_endog,
                  &tmp[0, 0], &k_endog)
        # Copy row k of tmp into column k of the s-th backwards block
        for k in range(k_endog):
            copy(k_endog, &tmp[k, 0], &backwards[s*k_endog2 + k*k_endog], k_endog)

        # Update the variance
        # Note: if s >= 1, this will be further updated in the for loop
        # below
        # Also, this calculation will be re-used in the forward variance
        # tmp = np.dot(forwards[:, s*k_endog:(s+1)*k_endog], backward_variance)
        # tmpT = np.dot(backward_variance.T, forwards[:, s*k_endog:(s+1)*k_endog].T)
        blas.{{prefix}}gemm("T", "T", &k_endog, &k_endog, &k_endog,
          &alpha, &backward_variance[0, 0], &k_endog,
                  &forwards[s * k_endog2], &k_endog,
          &beta, &tmp[0, 0], &k_endog)
        # autocovariances[:, (s+1)*k_endog:(s+2)*k_endog] = tmp.copy().T
        copy(k_endog2, &tmp[0, 0], &autocovariances[0, (s+1)*k_endog])

        # Create the remaining k = 1, ..., s matrices,
        # only has an effect if s >= 1
        for k in range(s):
            # forwards[:, k*k_endog:(k+1)*k_endog] = (
            #     prev_forwards[:, k*k_endog:(k+1)*k_endog] -
            #     np.dot(
            #         forwards[:, s*k_endog:(s+1)*k_endog],
            #         prev_backwards[:, (s-k-1)*k_endog:(s-k)*k_endog]
            #     )
            # )
            copy(k_endog2, &prev_forwards[k * k_endog2], &forwards[k * k_endog2])
            blas.{{prefix}}gemm("N", "N", &k_endog, &k_endog, &k_endog,
              &gamma, &forwards[s * k_endog2], &k_endog,
                      &prev_backwards[(s - k - 1) * k_endog2], &k_endog,
              &alpha, &forwards[k * k_endog2], &k_endog)

            # backwards[:, k*k_endog:(k+1)*k_endog] = (
            #     prev_backwards[:, k*k_endog:(k+1)*k_endog] -
            #     np.dot(
            #         backwards[:, s*k_endog:(s+1)*k_endog],
            #         prev_forwards[:, (s-k-1)*k_endog:(s-k)*k_endog]
            #     )
            # )
            copy(k_endog2, &prev_backwards[k * k_endog2], &backwards[k * k_endog2])
            blas.{{prefix}}gemm("N", "N", &k_endog, &k_endog, &k_endog,
              &gamma, &backwards[s * k_endog2], &k_endog,
                      &prev_forwards[(s - k - 1) * k_endog2], &k_endog,
              &alpha, &backwards[k * k_endog2], &k_endog)

            # autocovariances[:, (s+1)*k_endog:(s+2)*k_endog] += np.dot(
            #     autocovariances[:, (k+1)*k_endog:(k+2)*k_endog],
            #     prev_forwards[:, (s-k-1)*k_endog:(s-k)*k_endog].T
            # )
            blas.{{prefix}}gemm("N", "T", &k_endog, &k_endog, &k_endog,
              &alpha, &autocovariances[0, (k+1)*k_endog], &k_endog,
                      &prev_forwards[(s - k - 1) * k_endog2], &k_endog,
              &alpha, &autocovariances[0, (s+1)*k_endog], &k_endog)

        # Create forward and backwards variances
        # backward_variance = (
        #     backward_variance -
        #     np.dot(
        #         np.dot(backwards[:, s*k_endog:(s+1)*k_endog], forward_variance),
        #         backwards[:, s*k_endog:(s+1)*k_endog].T
        #     )
        # )
        blas.{{prefix}}gemm("N", "N", &k_endog, &k_endog, &k_endog,
          &alpha, &backwards[s * k_endog2], &k_endog,
                  &forward_variance[0, 0], &k_endog,
          &beta, &tmp2[0, 0], &k_endog)
        blas.{{prefix}}gemm("N", "T", &k_endog, &k_endog, &k_endog,
          &gamma, &tmp2[0, 0], &k_endog,
                  &backwards[s * k_endog2], &k_endog,
          &alpha, &backward_variance[0, 0], &k_endog)
        # forward_variance = (
        #     forward_variance -
        #     np.dot(tmp, forwards[:, s*k_endog:(s+1)*k_endog].T)
        # )
        # forward_variance = (
        #     forward_variance -
        #     np.dot(tmpT.T, forwards[:, s*k_endog:(s+1)*k_endog].T)
        # )
        blas.{{prefix}}gemm("T", "T", &k_endog, &k_endog, &k_endog,
          &gamma, &tmp[0, 0], &k_endog,
                  &forwards[s * k_endog2], &k_endog,
          &alpha, &forward_variance[0, 0], &k_endog)

        # Cholesky factors
        # forward_factors = linalg.cholesky(forward_variance, lower=True)
        # backward_factors =  linalg.cholesky(backward_variance, lower=True)
        copy(k_endog2, &forward_variance[0, 0], &forward_factors[0, 0])
        lapack.{{prefix}}potrf("L", &k_endog, &forward_factors[0,0], &k_endog, &info)
        copy(k_endog2, &backward_variance[0, 0], &backward_factors[0, 0])
        lapack.{{prefix}}potrf("L", &k_endog, &backward_factors[0,0], &k_endog, &info)


    # If we do not want to use the transformed variance, we need to
    # adjust the constrained matrices, as presented in Lemma 2.3, see above
    if not transform_variance:
        # Select the buffer that was written by the last loop iteration
        if order % 2 == 0:
            forwards = &forwards2[0,0]
        else:
            forwards = &forwards1[0,0]

        # Here, we need to construct T such that:
        # variance = T * initial_variance * T'
        # To do that, consider the Cholesky of variance (L) and
        # input_variance (M) to get:
        # L L' = T M M' T' = (TM) (TM)'
        # => L = T M
        # => L M^{-1} = T
        # NOTE(review): in the calls below, the factor labelled L' is computed
        # from `initial_variance` and the one labelled M' from a copy of
        # `forward_variance`; confirm this matches the naming in the
        # derivation above.
        # initial_variance_factor = np.linalg.cholesky(initial_variance)
        # L'
        lapack.{{prefix}}potrf("U", &k_endog, &initial_variance[0,0], &k_endog, &info)
        # transformed_variance_factor = np.linalg.cholesky(variance)
        # M'
        copy(k_endog2, &forward_variance[0, 0], &tmp[0, 0])
        lapack.{{prefix}}potrf("U", &k_endog, &tmp[0,0], &k_endog, &info)
        # {{prefix}}potri("L", &k_endog, &tmp[0,0], &k_endog, &info)

        # We need to zero out the lower triangle of L', because ?trtrs only
        # knows that M' is upper triangular
        for s in range(k_endog - 1):          # column
            for k in range(s+1, k_endog):     # row
                initial_variance[k, s] = 0

        # Note that T is lower triangular
        # L M^{-1} = T
        # M' T' = L'
        # transform = np.dot(initial_variance_factor,
        #                    np.linalg.inv(transformed_variance_factor))
        lapack.{{prefix}}trtrs("U", "N", "N", &k_endog, &k_endog, &tmp[0,0], &k_endog,
                                                           &initial_variance[0, 0], &k_endog, &info)
        # Now:
        # initial_variance = T'

        for s in range(order):
            # forwards[:, s*k_endog:(s+1)*k_endog] = (
            #     np.dot(
            #         np.dot(transform, forwards[:, s*k_endog:(s+1)*k_endog]),
            #         inv_transform
            #     )
            # )
            # TF T^{-1} = x
            # TF = x T
            # (TF)' = T' x'

            # Get TF
            copy(k_endog2, &forwards[s * k_endog2], &tmp2[0, 0])
            blas.{{prefix}}trmm("L", "U", "T", "N", &k_endog, &k_endog,
              &alpha, &initial_variance[0, 0], &k_endog,
                      &tmp2[0, 0], &k_endog)
            # Transpose tmp2 into tmp
            for k in range(k_endog):
                copy(k_endog, &tmp2[k, 0], &tmp[0, k], k_endog)
            # Get x'
            lapack.{{prefix}}trtrs("U", "N", "N", &k_endog, &k_endog, &initial_variance[0,0], &k_endog,
                                                               &tmp[0, 0], &k_endog, &info)
            # Get x
            for k in range(k_endog):
                copy(k_endog, &tmp[k, 0], &forwards[s * k_endog2 + k*k_endog], k_endog)


    if order % 2 == 0:
        return forwards2, forward_variance
    else:
        return forwards1, forward_variance

cpdef _{{prefix}}constrain_sv_less_than_one({{cython_type}} [::1,:] unconstrained, int order, int k_endog):
    """
    Transform arbitrary matrices to matrices with singular values less than
    one.

    Corresponds to Lemma 2.2 in Ansley and Kohn (1986). See
    `constrain_stationary_multivariate` for more details.

    Parameters
    ----------
    unconstrained : Fortran-ordered 2-dim memoryview
        Input matrices, read in blocks of `k_endog` columns; block `i`
        starts at column `i * k_endog`.
    order : int
        Number of blocks to transform.
    k_endog : int
        Dimension of each square block.

    Returns
    -------
    constrained : memoryview
        Array with `k_endog` rows and `k_endog * order` columns holding the
        transformed blocks in the same layout as the input.

    Notes
    -----
    The `info` codes set by the LAPACK calls are not inspected.
    """
    # Constants
    cdef:
        {{cython_type}} alpha = 1.0
        int k_endog2 = k_endog**2
        int info, i
    # Local variables
    cdef:
        np.npy_intp dim2[2]
        {{cython_type}} [::1, :] constrained
        {{cython_type}} [::1, :] tmp
        {{cython_type}} [::1, :] eye

    dim2[0] = k_endog; dim2[1] = k_endog * order;
    constrained = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)
    dim2[0] = k_endog; dim2[1] = k_endog;
    tmp = np.PyArray_ZEROS(2, dim2, {{typenum}}, FORTRAN)

    # Identity matrix, built once and copied into `tmp` for every block
    eye = np.asfortranarray(np.eye(k_endog, dtype={{dtype}}))
    for i in range(order):
        # tmp = I + A A', where A is the i-th block
        copy(k_endog2, &eye[0, 0], &tmp[0, 0])
        blas.{{prefix}}gemm("N", "T", &k_endog, &k_endog, &k_endog,
          &alpha, &unconstrained[0, i*k_endog], &k_endog,
                  &unconstrained[0, i*k_endog], &k_endog,
          &alpha, &tmp[0, 0], &k_endog)
        # Lower Cholesky factor B of tmp (stored in tmp)
        lapack.{{prefix}}potrf("L", &k_endog, &tmp[0, 0], &k_endog, &info)

        # Solve B X = A; the solution X overwrites the copy of A held in the
        # i-th block of `constrained`
        copy(k_endog2, &unconstrained[0, i*k_endog], &constrained[0, i*k_endog])
        # constrained.append(linalg.solve_triangular(B, A, lower=lower))
        lapack.{{prefix}}trtrs("L", "N", "N", &k_endog, &k_endog, &tmp[0, 0], &k_endog,
                                                           &constrained[0, i*k_endog], &k_endog, &info)
    return constrained

cdef int _{{prefix}}ldl({{cython_type}} * A, int n) except *:
    """
    In-place LDL' factorization of an n x n matrix.

    See Golub and Van Loan, Algorithm 4.1.2. On exit the diagonal of `A`
    holds d and the strictly lower triangle holds L; the strictly upper
    triangle is not touched.

    Parameters
    ----------
    A : pointer
        Pointer to the n x n matrix, indexed as `A[i + j*n]` for row i and
        column j. Only the lower triangle is read. Overwritten.
    n : int
        Dimension of `A`.

    Returns
    -------
    info : int
        0 if every pivot had real part greater than the tolerance; 1 if
        the last pivot handled was within the tolerance of zero (its column
        is zeroed); `-j` if the pivot in column j had real part below
        minus the tolerance, in which case the factorization stops there.
    """
    cdef:
        int info = 0
        int j, i, k
        np.npy_intp dim[1]
        np.float64_t tol = 1e-15
        {{cython_type}} [:] v

    dim[0] = n
    v = np.PyArray_ZEROS(1, dim, {{typenum}}, FORTRAN)

    for j in range(n):
        # Compute v(1:j)
        v[j] = A[j + j*n]

        # Positive definite element: use Golub and Van Loan algorithm
        if v[j].real < -tol:
            # NOTE(review): for j == 0 this yields info == 0, which cannot be
            # told apart from success; left unchanged to preserve the values
            # existing callers see.
            info = -j
            break
        elif v[j].real > tol:
            for i in range(j):
                v[i] = A[j + i*n] * A[i + i*n]
                v[j] = v[j] - A[j + i*n] * v[i]

            # Store d(j) and compute L(j+1:n,j)
            A[j + j*n] = v[j]
            for i in range(j+1, n):
                for k in range(j):
                    A[i + j*n] = A[i + j*n] - A[i + k*n] * v[k]
                A[i + j*n] = A[i + j*n] / v[j]
        # Positive semi-definite element: zero the appropriate column
        else:
            info = 1
            for i in range(j, n):
                # Zero d(j) and the entries of column j below it
                A[i + j*n] = 0

    return info

cpdef int {{prefix}}ldl({{cython_type}} [::1, :] A) except *:
    """
    In-place LDL' factorization of a Fortran-ordered matrix.

    Parameters
    ----------
    A : Fortran-ordered 2-dim memoryview
        Matrix to factorize; its first dimension is used as the size.
        Overwritten with the factorization.

    Returns
    -------
    info : int
        Status code reported by the underlying factorization routine.
    """
    # Propagate the status code instead of discarding it
    return _{{prefix}}ldl(&A[0,0], A.shape[0])

cdef int _{{prefix}}reorder_missing_diagonal({{cython_type}} * a, int * missing, int n):
    """
    Spread the leading diagonal entries of A out to the non-missing positions.

    a is a pointer to an n x n diagonal array A
    missing is a pointer to an n x 1 array
    n is the dimension of A

    Walking the diagonal from the last position to the first, each
    non-missing position receives the next entry taken (also backwards) from
    the leading block of the diagonal, and each missing position is set to
    zero.
    """
    cdef int pos, src, n_present

    # Number of non-missing positions
    n_present = n
    for pos in range(n):
        n_present -= missing[pos]

    # Fill from the back so that entries not yet moved are never overwritten
    src = n_present - 1
    for pos in range(n - 1, -1, -1):
        if missing[pos]:
            a[pos + pos*n] = 0
        else:
            a[pos + pos*n] = a[src + src*n]
            src -= 1

cdef int _{{prefix}}reorder_missing_submatrix({{cython_type}} * a, int * missing, int n):
    """
    Reorder both the rows and the columns of a square array.

    a is a pointer to an n x n array A
    missing is a pointer to an n x 1 array
    n is the dimension of A
    """
    # The same `missing` indicator is applied first to the rows and then to
    # the columns
    _{{prefix}}reorder_missing_rows(a, missing, n, n)
    _{{prefix}}reorder_missing_cols(a, missing, n, n)

cdef int _{{prefix}}reorder_missing_rows({{cython_type}} * a, int * missing, int n, int m):
    """
    Swap the leading rows of A out to the non-missing row positions.

    a is a pointer to an n x m array A
    missing is a pointer to an n x 1 array
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int row, src, n_present

    # Number of non-missing rows
    n_present = n
    for row in range(n):
        n_present -= missing[row]

    # Work from the last row backwards; rows are strided by n elements
    src = n_present - 1
    for row in range(n - 1, -1, -1):
        if missing[row]:
            continue
        swap(m, a + row, a + src, n, n)
        src -= 1


cdef int _{{prefix}}reorder_missing_cols({{cython_type}} * a, int * missing, int n, int m):
    """
    Move the leading columns of A to the non-missing column positions, in place.

    a is a pointer to an n x m array A (column j starts at a[j*n])
    missing is a pointer to an m x 1 array (nonzero marks a missing column)
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int col, src

    # Index of the last packed column: (m - number of missing columns) - 1
    src = m
    for col in range(m):
        src = src - missing[col]
    src = src - 1

    # Work from the last column backwards, exchanging each non-missing column
    # with the next packed column still waiting to be placed
    for col in range(m - 1, -1, -1):
        if missing[col]:
            continue
        swap(n, &a[col*n], &a[src*n])
        src = src - 1


cpdef int {{prefix}}reorder_missing_matrix({{cython_type}} [::1, :, :] A, int [::1, :] missing, int reorder_rows, int reorder_cols, int diagonal) except *:
    """
    Reorder every slice `A[:, :, t]` in place according to `missing[:, t]`.

    Parameters
    ----------
    A : memoryview, shape (n, m, T), contiguous along its first axis
    missing : int memoryview, contiguous along its first axis
        Column `t` holds the flags applied to slice `t` of `A`.
    reorder_rows, reorder_cols : int
        Flags selecting which axes are reordered. With both set, `A` must
        have n == m and is treated as a submatrix.
    diagonal : int
        If set (only allowed with both flags above), only the diagonal
        entries of each slice are reordered.

    Raises
    ------
    RuntimeError
        If both axes are requested but n != m, or if `diagonal` is set
        without both axes being requested.
    """
    cdef int n, m, T, t

    n, m, T = A.shape[0:3]

    if not (reorder_rows and reorder_cols):
        # At most one axis: the diagonal shortcut makes no sense here
        if diagonal:
            raise RuntimeError('`diagonal` argument only valid with reordering a submatrix')
        if reorder_rows:
            for t in range(T):
                _{{prefix}}reorder_missing_rows(&A[0, 0, t], &missing[0, t], n, m)
        elif reorder_cols:
            for t in range(T):
                _{{prefix}}reorder_missing_cols(&A[0, 0, t], &missing[0, t], n, m)
    else:
        # Both axes: square slices only
        if n != m:
            raise RuntimeError('Reordering a submatrix requires n = m')
        for t in range(T):
            if diagonal:
                _{{prefix}}reorder_missing_diagonal(&A[0, 0, t], &missing[0, t], n)
            else:
                _{{prefix}}reorder_missing_submatrix(&A[0, 0, t], &missing[0, t], n)


cpdef int {{prefix}}reorder_missing_vector({{cython_type}} [::1, :] A, int [::1, :] missing) except *:
    """
    Reorder every column `A[:, t]` in place according to `missing[:, t]`.

    Each column is handled as an n x 1 array by the row-reordering helper.

    Parameters
    ----------
    A : memoryview, shape (n, T), contiguous along its first axis
    missing : int memoryview, contiguous along its first axis
        Column `t` holds the flags applied to column `t` of `A`.
    """
    cdef int n, T, t

    n, T = A.shape[0:2]

    for t in range(T):
        # A single column is a matrix with one column (m = 1)
        _{{prefix}}reorder_missing_rows(&A[0, t], &missing[0, t], n, 1)


cdef int _{{prefix}}copy_missing_diagonal({{cython_type}} * a, {{cython_type}} * b, int * missing, int n):
    """
    Copy the leading block of diagonal entries from A to B.

    a is a pointer to an n x n array A (copy from); only a[i + i*n] is read
    b is a pointer to an n x n array B (copy to); only b[i + i*n] is written
    missing is a pointer to an n x 1 array; its sum is the number of entries
      that are NOT copied
    n is the dimension of A, B
    """
    cdef int pos, n_keep

    # Number of diagonal entries to copy: n minus the sum of the flags
    n_keep = n
    for pos in range(n):
        n_keep = n_keep - missing[pos]

    # Copy the first `n_keep` diagonal entries
    pos = 0
    while pos < n_keep:
        b[pos + pos*n] = a[pos + pos*n]
        pos = pos + 1


cdef int _{{prefix}}copy_missing_submatrix({{cython_type}} * a, {{cython_type}} * b, int * missing, int n):
    """
    Copy the leading (non-missing) square block from A to B.

    a is a pointer to an n x n array A (copy from)
    b is a pointer to an n x n array B (copy to)
    missing is a pointer to an n x 1 array; its sum is subtracted from n to
      get the size of the block that is copied
    n is the dimension of A, B
    """
    cdef int col, block

    # Size of the leading block: n minus the sum of the flags
    block = n
    for col in range(n):
        block = block - missing[col]

    # Copy the first `block` elements of each of the first `block` columns
    col = 0
    while col < block:
        copy(block, &a[col*n], &b[col*n])
        col = col + 1


cdef int _{{prefix}}copy_missing_rows({{cython_type}} * a, {{cython_type}} * b, int * missing, int n, int m):
    """
    Copy the leading (non-missing) rows from A to B.

    a is a pointer to an n x m array A (copy from)
    b is a pointer to an array B laid out like A (copy to)
    missing is a pointer to an n x 1 array; its sum is subtracted from n to
      get the number of rows copied
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int row, n_keep

    # Number of rows to copy: n minus the sum of the flags
    n_keep = n
    for row in range(n):
        n_keep = n_keep - missing[row]

    # A row is m elements spaced n apart in both source and destination
    row = 0
    while row < n_keep:
        copy(m, &a[row], &b[row], n, n)
        row = row + 1


cdef int _{{prefix}}copy_missing_cols({{cython_type}} * a, {{cython_type}} * b, int * missing, int n, int m):
    """
    Copy the leading (non-missing) columns from A to B.

    a is a pointer to an n x m array A (copy from)
    b is a pointer to an array B laid out like A (copy to)
    missing is a pointer to an m x 1 array; its sum is subtracted from m to
      get the number of columns copied
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int col, n_keep

    # Number of columns to copy: m minus the sum of the flags
    n_keep = m
    for col in range(m):
        n_keep = n_keep - missing[col]

    # Column `col` is the n consecutive elements starting at offset col*n
    col = 0
    while col < n_keep:
        copy(n, &a[col*n], &b[col*n])
        col = col + 1


cpdef int {{prefix}}copy_missing_matrix({{cython_type}} [::1, :, :] A, {{cython_type}} [::1, :, :] B, int [::1, :] missing, int missing_rows, int missing_cols, int diagonal) except *:
    """
    Copy the leading (non-missing) part of `A` into every slice `B[:, :, t]`.

    Parameters
    ----------
    A : memoryview, contiguous along its first axis
        Source. If its third dimension equals that of `B`, slice `t` of `A`
        is paired with slice `t` of `B`; otherwise slice 0 of `A` is used for
        every `t`.
    B : memoryview, shape (n, m, T), contiguous along its first axis
        Destination.
    missing : int memoryview, contiguous along its first axis
        Column `t` holds the flags used for slice `t`.
    missing_rows, missing_cols : int
        Flags selecting the axes along which only the leading part is
        copied. With both set, n == m is required.
    diagonal : int
        If set (only allowed with both flags above), only diagonal entries
        are copied.

    Raises
    ------
    RuntimeError
        If both axes are requested but n != m, or if `diagonal` is set
        without both axes being requested.
    """
    cdef int n, m, T, t, A_T, A_t = 0, time_varying

    n, m, T = B.shape[0:3]
    A_T = A.shape[2]
    time_varying = (A_T == T)

    # Reject invalid argument combinations before touching any data
    if missing_rows and missing_cols:
        if n != m:
            raise RuntimeError('Copying a submatrix requires n = m')
    elif diagonal:
        raise RuntimeError('`diagonal` argument only valid with copying a submatrix')

    for t in range(T):
        # A time-invariant source always contributes its first slice
        A_t = t if time_varying else 0

        if missing_rows and missing_cols:
            if diagonal:
                _{{prefix}}copy_missing_diagonal(&A[0, 0, A_t], &B[0, 0, t], &missing[0, t], n)
            else:
                _{{prefix}}copy_missing_submatrix(&A[0, 0, A_t], &B[0, 0, t], &missing[0, t], n)
        elif missing_rows:
            _{{prefix}}copy_missing_rows(&A[0, 0, A_t], &B[0, 0, t], &missing[0, t], n, m)
        elif missing_cols:
            _{{prefix}}copy_missing_cols(&A[0, 0, A_t], &B[0, 0, t], &missing[0, t], n, m)


cpdef int {{prefix}}copy_missing_vector({{cython_type}} [::1, :] A, {{cython_type}} [::1, :] B, int [::1, :] missing) except *:
    """
    Copy the leading (non-missing) elements of `A` into every column of `B`.

    Parameters
    ----------
    A : memoryview, contiguous along its first axis
        Source. If its second dimension equals that of `B`, column `t` of `A`
        is paired with column `t` of `B`; otherwise column 0 of `A` is used
        for every `t`.
    B : memoryview, shape (n, T), contiguous along its first axis
        Destination.
    missing : int memoryview, contiguous along its first axis
        Column `t` holds the flags used for column `t`; the number of
        elements copied is n minus the sum of those flags.
    """
    # `time_varying` is declared as a C int (as in the matrix variant) so the
    # per-iteration test below is a plain C comparison rather than a Python
    # object truth test.
    cdef int n, t, T, A_t = 0, A_T, time_varying

    n, T = B.shape[0:2]
    A_T = A.shape[1]
    time_varying = (A_T == T)

    for t in range(T):
        if time_varying:
            A_t = t
        # Each column is handled as an n x 1 matrix (m = 1)
        _{{prefix}}copy_missing_rows(&A[0, A_t], &B[0, t], &missing[0, t], n, 1)

cdef int _{{prefix}}copy_index_diagonal({{cython_type}} * a, {{cython_type}} * b, int * index, int n):
    """
    Copy the diagonal entries selected by `index` from A to B.

    a is a pointer to an n x n array A (copy from); only a[i + i*n] is read
    b is a pointer to an n x n array B (copy to); only b[i + i*n] is written
    index is a pointer to an n x 1 array (nonzero selects position i)
    n is the dimension of A, B
    """
    cdef int pos

    for pos in range(n):
        # Unselected positions of B are left untouched
        if not index[pos]:
            continue
        b[pos + pos*n] = a[pos + pos*n]


cdef int _{{prefix}}copy_index_submatrix({{cython_type}} * a, {{cython_type}} * b, int * index, int n):
    """
    Copy the rows and the columns selected by `index` from A to B.

    a is a pointer to an n x n array A (copy from)
    b is a pointer to an n x n array B (copy to)
    index is a pointer to an n x 1 array, used for both axes
    n is the dimension of A, B
    """
    # Selected rows first, then selected columns, using the same flags
    _{{prefix}}copy_index_rows(a, b, index, n, n)
    _{{prefix}}copy_index_cols(a, b, index, n, n)


cdef int _{{prefix}}copy_index_rows({{cython_type}} * a, {{cython_type}} * b, int * index, int n, int m):
    """
    Copy the rows selected by `index` from A to B.

    a is a pointer to an n x m array A (copy from)
    b is a pointer to an array B laid out like A (copy to)
    index is a pointer to an n x 1 array (nonzero selects row i)
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int row

    for row in range(n):
        # Unselected rows of B are left untouched
        if not index[row]:
            continue
        # A row is m elements spaced n apart in both source and destination
        copy(m, &a[row], &b[row], n, n)


cdef int _{{prefix}}copy_index_cols({{cython_type}} * a, {{cython_type}} * b, int * index, int n, int m):
    """
    Copy the columns selected by `index` from A to B.

    a is a pointer to an n x m array A (copy from)
    b is a pointer to an array B laid out like A (copy to)
    index is a pointer to an m x 1 array (nonzero selects column i)
    n is the number of rows of A
    m is the number of columns of A
    """
    cdef int col

    for col in range(m):
        # Unselected columns of B are left untouched
        if not index[col]:
            continue
        # Column `col` is the n consecutive elements starting at col*n
        copy(n, &a[col*n], &b[col*n])


cpdef int {{prefix}}copy_index_matrix({{cython_type}} [::1, :, :] A, {{cython_type}} [::1, :, :] B, int [::1, :] index, int index_rows, int index_cols, int diagonal) except *:
    """
    Copy the parts of `A` selected by `index` into every slice `B[:, :, t]`.

    Parameters
    ----------
    A : memoryview, contiguous along its first axis
        Source. If its third dimension equals that of `B`, slice `t` of `A`
        is paired with slice `t` of `B`; otherwise slice 0 of `A` is used for
        every `t`.
    B : memoryview, shape (n, m, T), contiguous along its first axis
        Destination.
    index : int memoryview, contiguous along its first axis
        Column `t` holds the selection flags used for slice `t`.
    index_rows, index_cols : int
        Flags choosing whether the selection applies to rows, columns, or
        both. With both set, n == m is required.
    diagonal : int
        If set (only allowed with both flags above), only the selected
        diagonal entries are copied.

    Raises
    ------
    RuntimeError
        If both axes are requested but n != m, or if `diagonal` is set
        without both axes being requested.
    """
    cdef int n, m, T, t, A_T, A_t = 0, time_varying

    n, m, T = B.shape[0:3]
    A_T = A.shape[2]
    time_varying = (A_T == T)

    # Reject invalid argument combinations before touching any data
    if index_rows and index_cols:
        if n != m:
            raise RuntimeError('Copying a submatrix requires n = m')
    elif diagonal:
        raise RuntimeError('`diagonal` argument only valid with copying a submatrix')

    for t in range(T):
        # A time-invariant source always contributes its first slice
        A_t = t if time_varying else 0

        if index_rows and index_cols:
            if diagonal:
                _{{prefix}}copy_index_diagonal(&A[0, 0, A_t], &B[0, 0, t], &index[0, t], n)
            else:
                _{{prefix}}copy_index_submatrix(&A[0, 0, A_t], &B[0, 0, t], &index[0, t], n)
        elif index_rows:
            _{{prefix}}copy_index_rows(&A[0, 0, A_t], &B[0, 0, t], &index[0, t], n, m)
        elif index_cols:
            _{{prefix}}copy_index_cols(&A[0, 0, A_t], &B[0, 0, t], &index[0, t], n, m)


cpdef int {{prefix}}copy_index_vector({{cython_type}} [::1, :] A, {{cython_type}} [::1, :] B, int [::1, :] index) except *:
    """
    Copy the elements of `A` selected by `index` into every column of `B`.

    Parameters
    ----------
    A : memoryview, contiguous along its first axis
        Source. If its second dimension equals that of `B`, column `t` of `A`
        is paired with column `t` of `B`; otherwise column 0 of `A` is used
        for every `t`.
    B : memoryview, shape (n, T), contiguous along its first axis
        Destination; elements that are not selected are left untouched.
    index : int memoryview, contiguous along its first axis
        Column `t` holds the selection flags (nonzero = copy) for column `t`.
    """
    # `time_varying` is declared as a C int (as in the matrix variant) so the
    # per-iteration test below is a plain C comparison rather than a Python
    # object truth test.
    cdef int n, t, T, A_t = 0, A_T, time_varying

    n, T = B.shape[0:2]
    A_T = A.shape[1]
    time_varying = (A_T == T)

    for t in range(T):
        if time_varying:
            A_t = t
        # Each column is handled as an n x 1 matrix (m = 1)
        _{{prefix}}copy_index_rows(&A[0, A_t], &B[0, t], &index[0, t], n, 1)

cdef int _{{prefix}}select_cov(int k_states, int k_posdef, int k_states_total,
                               {{cython_type}} * tmp,
                               {{cython_type}} * selection,
                               {{cython_type}} * cov,
                               {{cython_type}} * selected_cov):
    """
    Compute the "selected" covariance matrix $Q^* = R Q R'$.

    k_states is the number of rows of R that are used and the dimension of
      the output
    k_posdef is the dimension of Q (and the number of columns of R)
    k_states_total is the leading dimension of the `selection` array
    tmp is a pointer to workspace of at least k_states x k_posdef elements
    selection is a pointer to R
    cov is a pointer to the k_posdef x k_posdef array Q
    selected_cov is a pointer to the k_states x k_states output array

    When k_posdef is not positive there is no covariance matrix to select and
    the output is filled with zeros instead.
    """
    cdef:
        int idx
        int n_elements = k_states**2
        {{cython_type}} one = 1.0
        {{cython_type}} zero = 0.0

    if k_posdef <= 0:
        # No covariance matrix: the selected covariance is identically zero
        for idx in range(n_elements):
            selected_cov[idx] = 0
    else:
        # The result is a simplified (but possibly singular) covariance
        # matrix (see e.g. Durbin and Koopman p. 43), built in two products.

        # Step 1: tmp = R Q
        # $(m \times r) = (m \times r) (r \times r)$
        blas.{{prefix}}gemm("N", "N", &k_states, &k_posdef, &k_posdef,
              &one, selection, &k_states_total,
                    cov, &k_posdef,
              &zero, tmp, &k_states)

        # Step 2: Q^* = tmp R'
        # $(m \times m) = (m \times r) (m \times r)'$
        blas.{{prefix}}gemm("N", "T", &k_states, &k_states, &k_posdef,
              &one, tmp, &k_states,
                    selection, &k_states_total,
              &zero, selected_cov, &k_states)

{{endfor}}