#!/usr/bin/env python
# Copyright 2014-2020 The PySCF Developers. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Author: Qiming Sun <osirpt.sun@gmail.com>
#

'''
Extension to numpy and scipy
'''

import string
import ctypes
import math
import numpy
from pyscf.lib import misc
from numpy import asarray  # For backward compatibility

EINSUM_MAX_SIZE = getattr(misc.__config__, 'lib_einsum_max_size', 2000)

try:
    # Import tblis before libnp_helper to avoid potential dl-loading conflicts
    from pyscf import tblis_einsum
    FOUND_TBLIS = True
except (ImportError, OSError):
    FOUND_TBLIS = False

_np_helper = misc.load_library('libnp_helper')

BLOCK_DIM = 192
PLAIN = 0
HERMITIAN = 1
ANTIHERMI = 2
SYMMETRIC = 3

LeviCivita = numpy.zeros((3,3,3))
LeviCivita[0,1,2] = LeviCivita[1,2,0] = LeviCivita[2,0,1] = 1
LeviCivita[0,2,1] = LeviCivita[2,1,0] = LeviCivita[1,0,2] = -1

PauliMatrices = numpy.array([[[0., 1.],
                              [1., 0.]],  # x
                             [[0.,-1j],
                              [1j, 0.]],  # y
                             [[1., 0.],
                              [0.,-1.]]]) # z

if hasattr(numpy, 'einsum_path'):
    _einsum_path = numpy.einsum_path
else:
    def _einsum_path(subscripts, *operands, **kwargs):
        #indices  = re.split(',|->', subscripts)
        #indices_in = indices[:-1]
        #idx_final = indices[-1]
        if '->' in subscripts:
            indices_in, idx_final = subscripts.split('->')
            indices_in = indices_in.split(',')
            # indices = indices_in + [idx_final]
        else:
            idx_final = ''
            indices_in = subscripts.split('->')[0].split(',')

        if len(indices_in) <= 2:
            idx_removed = set(indices_in[0]).intersection(set(indices_in[1]))
            einsum_str = indices_in[1] + ',' + indices_in[0] + '->' + idx_final
            return operands, [((1,0), idx_removed, einsum_str, idx_final)]

        input_sets = [set(x) for x in indices_in]
        n_shared_max = 0
        for i in range(len(indices_in)):
            for j in range(i):
                tmp = input_sets[i].intersection(input_sets[j])
                n_shared_indices = len(tmp)
                if n_shared_indices > n_shared_max:
                    n_shared_max = n_shared_indices
                    idx_removed = tmp
                    a,b = i,j

        idxA = indices_in.pop(a)
        idxB = indices_in.pop(b)
        rest_idx = ''.join(indices_in) + idx_final
        idx_out = input_sets[a].union(input_sets[b])
        idx_out = ''.join(idx_out.intersection(set(rest_idx)))

        indices_in.append(idx_out)
        einsum_str = idxA + ',' + idxB + '->' + idx_out
        einsum_args = _einsum_path(','.join(indices_in)+'->'+idx_final)[1]
        einsum_args.insert(0, ((a, b), idx_removed, einsum_str, indices_in))
        return operands, einsum_args

_numpy_einsum = numpy.einsum
def _contract(subscripts, *tensors, **kwargs):
    idx_str = subscripts.replace(' ','')
    A, B = tensors
    # Call numpy.asarray because A or B may be HDF5 Datasets
    A = numpy.asarray(A)
    B = numpy.asarray(B)

    # small problem size
    if A.size < EINSUM_MAX_SIZE or B.size < EINSUM_MAX_SIZE:
        return _numpy_einsum(idx_str, A, B)

    C_dtype = numpy.result_type(A, B)
    if FOUND_TBLIS and C_dtype == numpy.double:
        # tblis is slow for complex type
        return tblis_einsum.contract(idx_str, A, B, **kwargs)

    indices  = idx_str.replace(',', '').replace('->', '')
    if '->' not in idx_str or any(indices.count(x) != 2 for x in set(indices)):
        return _numpy_einsum(idx_str, A, B)

    # Split the strings into a list of idx char's
    idxA, idxBC = idx_str.split(',')
    idxB, idxC = idxBC.split('->')
    assert len(idxA) == A.ndim
    assert len(idxB) == B.ndim

    uniq_idxA = set(idxA)
    uniq_idxB = set(idxB)
    # Find the shared indices being summed over
    shared_idxAB = uniq_idxA.intersection(uniq_idxB)

    if ((not shared_idxAB) or  # Indices must overlap
        # one operand is a subset of the other one (e.g. 'ijkl,jk->il')
        uniq_idxA == shared_idxAB or uniq_idxB == shared_idxAB or
        # repeated indices (e.g. 'iijk,kl->jl')
        len(idxA) != len(uniq_idxA) or len(idxB) != len(uniq_idxB)):
        return _numpy_einsum(idx_str, A, B)

    DEBUG = kwargs.get('DEBUG', False)

    if DEBUG:
        print("*** Einsum for", idx_str)
        print(" idxA =", idxA)
        print(" idxB =", idxB)
        print(" idxC =", idxC)

    # Get the range for each index and put it in a dictionary
    rangeA = dict(zip(idxA, A.shape))
    rangeB = dict(zip(idxB, B.shape))
    #rangeC = dict(zip(idxC, C.shape))
    if DEBUG:
        print("rangeA =", rangeA)
        print("rangeB =", rangeB)

    idxAt = list(idxA)
    idxBt = list(idxB)
    inner_shape = 1
    insert_B_loc = 0
    for n in shared_idxAB:
        if rangeA[n] != rangeB[n]:
            err = ('ERROR: In index string %s, the range of index %s is '
                   'different in A (%d) and B (%d)' %
                   (idx_str, n, rangeA[n], rangeB[n]))
            raise ValueError(err)

        # Bring idx all the way to the right for A
        # and to the left (but preserve order) for B
        idxA_n = idxAt.index(n)
        idxAt.insert(len(idxAt)-1, idxAt.pop(idxA_n))

        idxB_n = idxBt.index(n)
        idxBt.insert(insert_B_loc, idxBt.pop(idxB_n))
        insert_B_loc += 1

        inner_shape *= rangeA[n]

    if DEBUG:
        print("shared_idxAB =", shared_idxAB)
        print("inner_shape =", inner_shape)

    # Transpose the tensors into the proper order and reshape into matrices
    new_orderA = [idxA.index(idx) for idx in idxAt]
    new_orderB = [idxB.index(idx) for idx in idxBt]

    if DEBUG:
        print("Transposing A as", new_orderA)
        print("Transposing B as", new_orderB)
        print("Reshaping A as (-1,", inner_shape, ")")
        print("Reshaping B as (", inner_shape, ",-1)")

    shapeCt = list()
    idxCt = list()
    for idx in idxAt:
        if idx in shared_idxAB:
            break
        shapeCt.append(rangeA[idx])
        idxCt.append(idx)
    for idx in idxBt:
        if idx in shared_idxAB:
            continue
        shapeCt.append(rangeB[idx])
        idxCt.append(idx)
    new_orderCt = [idxCt.index(idx) for idx in idxC]

    if A.size == 0 or B.size == 0:
        shapeCt = [shapeCt[i] for i in new_orderCt]
        return numpy.zeros(shapeCt, dtype=C_dtype)

    At = A.transpose(new_orderA)
    Bt = B.transpose(new_orderB)

    if At.flags.f_contiguous:
        At = numpy.asarray(At.reshape(-1,inner_shape), order='F')
    else:
        At = numpy.asarray(At.reshape(-1,inner_shape), order='C')
    if Bt.flags.f_contiguous:
        Bt = numpy.asarray(Bt.reshape(inner_shape,-1), order='F')
    else:
        Bt = numpy.asarray(Bt.reshape(inner_shape,-1), order='C')

    return dot(At,Bt).reshape(shapeCt, order='A').transpose(new_orderCt)

def einsum(subscripts, *tensors, **kwargs):
    '''Perform a more efficient einsum via reshaping to a matrix multiply.

    Current differences compared to numpy.einsum:
    This assumes that each repeated index is actually summed (i.e. no 'i,i->i')
    and appears only twice (i.e. no 'ij,ik,il->jkl'). The output indices must
    be explicitly specified (i.e. 'ij,j->i' and not 'ij,j').
    '''
    contract = kwargs.pop('_contract', _contract)

    subscripts = subscripts.replace(' ','')
    if len(tensors) <= 1 or '...' in subscripts:
        out = _numpy_einsum(subscripts, *tensors, **kwargs)
    elif len(tensors) <= 2:
        out = _contract(subscripts, *tensors, **kwargs)
    else:
        if '->' in subscripts:
            indices_in, idx_final = subscripts.split('->')
            indices_in = indices_in.split(',')
        else:
            # idx_final = ''
            indices_in = subscripts.split('->')[0].split(',')
        tensors = list(tensors)
        contraction_list = _einsum_path(subscripts, *tensors, optimize=True,
                                        einsum_call=True)[1]
        for contraction in contraction_list:
            inds, idx_rm, einsum_str, remaining = contraction[:4]
            tmp_operands = [tensors.pop(x) for x in inds]
            if len(tmp_operands) > 2:
                out = _numpy_einsum(einsum_str, *tmp_operands)
            else:
                out = contract(einsum_str, *tmp_operands)
            tensors.append(out)
    return out


# 2d -> 1d or 3d -> 2d
def pack_tril(mat, axis=-1, out=None):
    '''flatten the lower triangular part of a matrix.
    Given mat, it returns mat[...,numpy.tril_indices(mat.shape[0])]

    Examples:

    >>> pack_tril(numpy.arange(9).reshape(3,3))
    [0 3 4 6 7 8]
    '''
    if mat.size == 0:
        return numpy.zeros(mat.shape+(0,), dtype=mat.dtype)

    if mat.ndim == 2:
        count, nd = 1, mat.shape[0]
        shape = nd*(nd+1)//2
    else:
        count, nd = mat.shape[:2]
        shape = (count, nd*(nd+1)//2)

    if mat.ndim == 2 or axis == -1:
        mat = numpy.asarray(mat, order='C')
        out = numpy.ndarray(shape, mat.dtype, buffer=out)
        if mat.dtype == numpy.double:
            fn = _np_helper.NPdpack_tril_2d
        elif mat.dtype == numpy.complex128:
            fn = _np_helper.NPzpack_tril_2d
        else:
            out[:] = mat[numpy.tril_indices(nd)]
            return out

        fn(ctypes.c_int(count), ctypes.c_int(nd),
           out.ctypes.data_as(ctypes.c_void_p),
           mat.ctypes.data_as(ctypes.c_void_p))
        return out

    else:  # pack the leading two dimension
        assert(axis == 0)
        out = mat[numpy.tril_indices(nd)]
        return out

# 1d -> 2d or 2d -> 3d, write hermitian lower triangle to upper triangle
def unpack_tril(tril, filltriu=HERMITIAN, axis=-1, out=None):
    '''Reversed operation of pack_tril.

    Kwargs:
        filltriu : int

            | 0           Do not fill the upper triangular part, random number may appear
                          in the upper triangular part
            | 1 (default) Transpose the lower triangular part to fill the upper triangular part
            | 2           Similar to filltriu=1, negative of the lower triangular part is assign
                          to the upper triangular part to make the matrix anti-hermitian

    Examples:

    >>> unpack_tril(numpy.arange(6.))
    [[ 0. 1. 3.]
     [ 1. 2. 4.]
     [ 3. 4. 5.]]
    >>> unpack_tril(numpy.arange(6.), 0)
    [[ 0. 0. 0.]
     [ 1. 2. 0.]
     [ 3. 4. 5.]]
    >>> unpack_tril(numpy.arange(6.), 2)
    [[ 0. -1. -3.]
     [ 1.  2. -4.]
     [ 3.  4.  5.]]
    '''
    tril = numpy.asarray(tril, order='C')
    if tril.ndim == 1:
        count, nd = 1, tril.size
        nd = int(numpy.sqrt(nd*2))
        shape = (nd,nd)
    elif tril.ndim == 2:
        if axis == 0:
            nd, count = tril.shape
        else:
            count, nd = tril.shape
        nd = int(numpy.sqrt(nd*2))
        shape = (count,nd,nd)
    else:
        raise NotImplementedError('unpack_tril for high dimension arrays')

    if (tril.dtype != numpy.double and tril.dtype != numpy.complex128):
        out = numpy.ndarray(shape, tril.dtype, buffer=out)
        idx, idy = numpy.tril_indices(nd)
        if filltriu == ANTIHERMI:
            out[...,idy,idx] = -tril
        else:
            out[...,idy,idx] = tril
        out[...,idx,idy] = tril
        return out

    elif tril.ndim == 1 or axis == -1 or axis == tril.ndim-1:
        out = numpy.ndarray(shape, tril.dtype, buffer=out)
        if tril.dtype == numpy.double:
            fn = _np_helper.NPdunpack_tril_2d
        else:
            fn = _np_helper.NPzunpack_tril_2d
        fn(ctypes.c_int(count), ctypes.c_int(nd),
           tril.ctypes.data_as(ctypes.c_void_p),
           out.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(filltriu))
        return out

    else:  # unpack the leading dimension
        assert(axis == 0)
        shape = (nd,nd) + tril.shape[1:]
        out = numpy.ndarray(shape, tril.dtype, buffer=out)
        idx = numpy.tril_indices(nd)
        if filltriu == HERMITIAN:
            for ij,(i,j) in enumerate(zip(*idx)):
                out[i,j] = tril[ij]
                out[j,i] = tril[ij].conj()
        elif filltriu == ANTIHERMI:
            for ij,(i,j) in enumerate(zip(*idx)):
                out[i,j] = tril[ij]
                out[j,i] =-tril[ij].conj()
        elif filltriu == SYMMETRIC:
            #:for ij,(i,j) in enumerate(zip(*idx)):
            #:    out[i,j] = out[j,i] = tril[ij]
            idxy = numpy.empty((nd,nd), dtype=numpy.int)
            idxy[idx[0],idx[1]] = idxy[idx[1],idx[0]] = numpy.arange(nd*(nd+1)//2)
            numpy.take(tril, idxy, axis=0, out=out)
        else:
            out[idx] = tril
        return out

# extract a row from a tril-packed matrix
def unpack_row(tril, row_id):
    '''Extract one row of the lower triangular part of a matrix.
    It is equivalent to unpack_tril(a)[row_id]

    Examples:

    >>> unpack_row(numpy.arange(6.), 0)
    [ 0. 1. 3.]
    >>> unpack_tril(numpy.arange(6.))[0]
    [ 0. 1. 3.]
    '''
    tril = numpy.ascontiguousarray(tril)
    nd = int(numpy.sqrt(tril.size*2))
    mat = numpy.empty(nd, tril.dtype)
    if tril.dtype == numpy.double:
        fn = _np_helper.NPdunpack_row
    elif tril.dtype == numpy.complex128:
        fn = _np_helper.NPzunpack_row
    else:
        p0 = row_id*(row_id+1)//2
        p1 = row_id*(row_id+1)//2 + row_id
        idx = numpy.arange(row_id, nd)
        return numpy.append(tril[p0:p1], tril[idx*(idx+1)//2+row_id])

    fn.restype = ctypes.c_void_p
    fn(ctypes.c_int(nd), ctypes.c_int(row_id),
       tril.ctypes.data_as(ctypes.c_void_p),
       mat.ctypes.data_as(ctypes.c_void_p))
    return mat

# for i > j of 2d mat, mat[j,i] = mat[i,j]
def hermi_triu(mat, hermi=HERMITIAN, inplace=True):
    '''Use the elements of the lower triangular part to fill the upper triangular part.

    Kwargs:
        filltriu : int

            | 1 (default) return a hermitian matrix
            | 2           return an anti-hermitian matrix

    Examples:

    >>> unpack_row(numpy.arange(9.).reshape(3,3), 1)
    [[ 0.  3.  6.]
     [ 3.  4.  7.]
     [ 6.  7.  8.]]
    >>> unpack_row(numpy.arange(9.).reshape(3,3), 2)
    [[ 0. -3. -6.]
     [ 3.  4. -7.]
     [ 6.  7.  8.]]
    '''
    assert(hermi == HERMITIAN or hermi == ANTIHERMI)
    if not inplace:
        mat = mat.copy('A')
    if mat.flags.c_contiguous:
        buf = mat
    elif mat.flags.f_contiguous:
        buf = mat.T
    else:
        raise NotImplementedError

    nd = mat.shape[0]
    assert(mat.size == nd**2)

    if mat.dtype == numpy.double:
        fn = _np_helper.NPdsymm_triu
    elif mat.dtype == numpy.complex128:
        fn = _np_helper.NPzhermi_triu
    else:
        raise NotImplementedError
    fn.restype = ctypes.c_void_p
    fn(ctypes.c_int(nd), buf.ctypes.data_as(ctypes.c_void_p),
       ctypes.c_int(hermi))
    return mat


LINEAR_DEP_THRESHOLD = 1e-10
def solve_lineq_by_SVD(a, b):
    '''Solving a * x = b.  If a is a singular matrix, its small SVD values are
    neglected.
    '''
    t, w, vH = numpy.linalg.svd(a)
    idx = []
    for i,wi in enumerate(w):
        if wi > LINEAR_DEP_THRESHOLD:
            idx.append(i)
    if idx:
        idx = numpy.array(idx)
        tb = numpy.dot(numpy.array(t[:,idx]).T.conj(), numpy.array(b))
        x = numpy.dot(numpy.array(vH[idx,:]).T.conj(), tb / w[idx])
    else:
        x = numpy.zeros_like(b)
    return x

def take_2d(a, idx, idy, out=None):
    '''Equivalent to a[idx[:,None],idy] for a 2D array.

    Examples:

    >>> out = numpy.arange(9.).reshape(3,3)
    >>> take_2d(a, [0,2], [0,2])
    [[ 0.  2.]
     [ 6.  8.]]
    '''
    a = numpy.asarray(a, order='C')
    out = numpy.ndarray((len(idx),len(idy)), dtype=a.dtype, buffer=out)
    idx = numpy.asarray(idx, dtype=numpy.int32)
    idy = numpy.asarray(idy, dtype=numpy.int32)
    if a.dtype == numpy.double:
        fn = _np_helper.NPdtake_2d
    elif a.dtype == numpy.complex128:
        fn = _np_helper.NPztake_2d
    else:
        return a[idx[:,None],idy]
    fn(out.ctypes.data_as(ctypes.c_void_p),
       a.ctypes.data_as(ctypes.c_void_p),
       idx.ctypes.data_as(ctypes.c_void_p),
       idy.ctypes.data_as(ctypes.c_void_p),
       ctypes.c_int(out.shape[1]), ctypes.c_int(a.shape[1]),
       ctypes.c_int(idx.size), ctypes.c_int(idy.size))
    return out

def takebak_2d(out, a, idx, idy, thread_safe=True):
    '''Reverse operation of take_2d.  Equivalent to out[idx[:,None],idy] += a
    for a 2D array.

    Examples:

    >>> out = numpy.zeros((3,3))
    >>> takebak_2d(out, numpy.ones((2,2)), [0,2], [0,2])
    [[ 1.  0.  1.]
     [ 0.  0.  0.]
     [ 1.  0.  1.]]
    '''
    assert(out.flags.c_contiguous)
    a = numpy.asarray(a, order='C')
    if out.dtype != a.dtype:
        a = a.astype(out.dtype)
    if out.dtype == numpy.double:
        fn = _np_helper.NPdtakebak_2d
    elif out.dtype == numpy.complex128:
        fn = _np_helper.NPztakebak_2d
    else:
        if thread_safe:
            out[idx[:,None], idy] += a
        else:
            raise NotImplementedError
        return out
    idx = numpy.asarray(idx, dtype=numpy.int32)
    idy = numpy.asarray(idy, dtype=numpy.int32)
    fn(out.ctypes.data_as(ctypes.c_void_p),
       a.ctypes.data_as(ctypes.c_void_p),
       idx.ctypes.data_as(ctypes.c_void_p),
       idy.ctypes.data_as(ctypes.c_void_p),
       ctypes.c_int(out.shape[1]), ctypes.c_int(a.shape[1]),
       ctypes.c_int(idx.size), ctypes.c_int(idy.size),
       ctypes.c_int(thread_safe))
    return out

def transpose(a, axes=None, inplace=False, out=None):
    '''Transposing an array with better memory efficiency

    Examples:

    >>> transpose(numpy.ones((3,2)))
    [[ 1.  1.  1.]
     [ 1.  1.  1.]]
    '''
    if inplace:
        arow, acol = a.shape
        assert(arow == acol)
        tmp = numpy.empty((BLOCK_DIM,BLOCK_DIM))
        for c0, c1 in misc.prange(0, acol, BLOCK_DIM):
            for r0, r1 in misc.prange(0, c0, BLOCK_DIM):
                tmp[:c1-c0,:r1-r0] = a[c0:c1,r0:r1]
                a[c0:c1,r0:r1] = a[r0:r1,c0:c1].T
                a[r0:r1,c0:c1] = tmp[:c1-c0,:r1-r0].T
            # diagonal blocks
            a[c0:c1,c0:c1] = a[c0:c1,c0:c1].T
        return a

    if (not a.flags.c_contiguous
        or (a.dtype != numpy.double and a.dtype != numpy.complex128)):
        if a.ndim == 2:
            arow, acol = a.shape
            out = numpy.empty((acol,arow), a.dtype)
            r1 = c1 = 0
            for c0 in range(0, acol-BLOCK_DIM, BLOCK_DIM):
                c1 = c0 + BLOCK_DIM
                for r0 in range(0, arow-BLOCK_DIM, BLOCK_DIM):
                    r1 = r0 + BLOCK_DIM
                    out[c0:c1,r0:r1] = a[r0:r1,c0:c1].T
                out[c0:c1,r1:arow] = a[r1:arow,c0:c1].T
            for r0 in range(0, arow-BLOCK_DIM, BLOCK_DIM):
                r1 = r0 + BLOCK_DIM
                out[c1:acol,r0:r1] = a[r0:r1,c1:acol].T
            out[c1:acol,r1:arow] = a[r1:arow,c1:acol].T
            return out
        else:
            return a.transpose(axes)

    if a.ndim == 2:
        arow, acol = a.shape
        c_shape = (ctypes.c_int*3)(1, arow, acol)
        out = numpy.ndarray((acol, arow), a.dtype, buffer=out)
    elif a.ndim == 3 and axes == (0,2,1):
        d0, arow, acol = a.shape
        c_shape = (ctypes.c_int*3)(d0, arow, acol)
        out = numpy.ndarray((d0, acol, arow), a.dtype, buffer=out)
    else:
        raise NotImplementedError

    assert(a.flags.c_contiguous)
    if a.dtype == numpy.double:
        fn = _np_helper.NPdtranspose_021
    else:
        fn = _np_helper.NPztranspose_021
    fn.restype = ctypes.c_void_p
    fn(c_shape, a.ctypes.data_as(ctypes.c_void_p),
       out.ctypes.data_as(ctypes.c_void_p))
    return out

def transpose_sum(a, inplace=False, out=None):
    '''Computing a + a.T with better memory efficiency

    Examples:

    >>> transpose_sum(numpy.arange(4.).reshape(2,2))
    [[ 0.  3.]
     [ 3.  6.]]
    '''
    return hermi_sum(a, inplace=inplace, out=out)

def hermi_sum(a, axes=None, hermi=HERMITIAN, inplace=False, out=None):
    '''Computing a + a.T.conj() with better memory efficiency

    Examples:

    >>> transpose_sum(numpy.arange(4.).reshape(2,2))
    [[ 0.  3.]
     [ 3.  6.]]
    '''
    if inplace:
        out = a
    else:
        out = numpy.ndarray(a.shape, a.dtype, buffer=out)

    if (not a.flags.c_contiguous
        or (a.dtype != numpy.double and a.dtype != numpy.complex128)):
        if a.ndim == 2:
            na = a.shape[0]
            for c0, c1 in misc.prange(0, na, BLOCK_DIM):
                for r0, r1 in misc.prange(0, c0, BLOCK_DIM):
                    tmp = a[r0:r1,c0:c1] + a[c0:c1,r0:r1].conj().T
                    out[c0:c1,r0:r1] = tmp.T.conj()
                    out[r0:r1,c0:c1] = tmp
                # diagonal blocks
                tmp = a[c0:c1,c0:c1] + a[c0:c1,c0:c1].conj().T
                out[c0:c1,c0:c1] = tmp
            return out
        else:
            raise NotImplementedError('input array is not C-contiguous')

    if a.ndim == 2:
        assert(a.shape[0] == a.shape[1])
        c_shape = (ctypes.c_int*3)(1, a.shape[0], a.shape[1])
    elif a.ndim == 3 and axes == (0,2,1):
        assert(a.shape[1] == a.shape[2])
        c_shape = (ctypes.c_int*3)(*(a.shape))
    else:
        raise NotImplementedError

    assert(a.flags.c_contiguous)
    if a.dtype == numpy.double:
        fn = _np_helper.NPdsymm_021_sum
    else:
        fn = _np_helper.NPzhermi_021_sum
    fn(c_shape, a.ctypes.data_as(ctypes.c_void_p),
       out.ctypes.data_as(ctypes.c_void_p), ctypes.c_int(hermi))
    return out

# NOTE: NOT assume array a, b to be C-contiguous, since a and b are two
# pointers we want to pass in.
# numpy.dot might not call optimized blas
def ddot(a, b, alpha=1, c=None, beta=0):
    '''Matrix-matrix multiplication for double precision arrays
    '''
    m = a.shape[0]
    k = a.shape[1]
    n = b.shape[1]
    if a.flags.c_contiguous:
        trans_a = 'N'
    elif a.flags.f_contiguous:
        trans_a = 'T'
        a = a.T
    else:
        a = numpy.asarray(a, order='C')
        trans_a = 'N'
        #raise ValueError('a.flags: %s' % str(a.flags))

    assert(k == b.shape[0])
    if b.flags.c_contiguous:
        trans_b = 'N'
    elif b.flags.f_contiguous:
        trans_b = 'T'
        b = b.T
    else:
        b = numpy.asarray(b, order='C')
        trans_b = 'N'
        #raise ValueError('b.flags: %s' % str(b.flags))

    if c is None:
        c = numpy.empty((m,n))
        beta = 0
    else:
        assert(c.shape == (m,n))

    return _dgemm(trans_a, trans_b, m, n, k, a, b, c, alpha, beta)

def zdot(a, b, alpha=1, c=None, beta=0):
    '''Matrix-matrix multiplication for double complex arrays
    '''
    m = a.shape[0]
    k = a.shape[1]
    n = b.shape[1]
    if a.flags.c_contiguous:
        trans_a = 'N'
    elif a.flags.f_contiguous:
        trans_a = 'T'
        a = a.T
    else:
        raise ValueError('a.flags: %s' % str(a.flags))

    assert(k == b.shape[0])
    if b.flags.c_contiguous:
        trans_b = 'N'
    elif b.flags.f_contiguous:
        trans_b = 'T'
        b = b.T
    else:
        raise ValueError('b.flags: %s' % str(b.flags))

    if c is None:
        beta = 0
        c = numpy.empty((m,n), dtype=numpy.complex128)
    else:
        assert(c.shape == (m,n))

    return _zgemm(trans_a, trans_b, m, n, k, a, b, c, alpha, beta)

def dot(a, b, alpha=1, c=None, beta=0):
    atype = a.dtype
    btype = b.dtype

    if atype == numpy.float64 and btype == numpy.float64:
        if c is None or c.dtype == numpy.float64:
            return ddot(a, b, alpha, c, beta)
        else:
            cr = numpy.asarray(c.real, order='C')
            c.real = ddot(a, b, alpha, cr, beta)
            return c

    elif atype == numpy.complex128 and btype == numpy.complex128:
        # Gauss's complex multiplication algorithm may affect numerical stability
        #k1 = ddot(a.real+a.imag, b.real.copy(), alpha)
        #k2 = ddot(a.real.copy(), b.imag-b.real, alpha)
        #k3 = ddot(a.imag.copy(), b.real+b.imag, alpha)
        #ab = k1-k3 + (k1+k2)*1j
        return zdot(a, b, alpha, c, beta)

    elif atype == numpy.float64 and btype == numpy.complex128:
        if b.flags.f_contiguous:
            order = 'F'
        else:
            order = 'C'
        cr = ddot(a, numpy.asarray(b.real, order=order), alpha)
        ci = ddot(a, numpy.asarray(b.imag, order=order), alpha)
        ab = numpy.ndarray(cr.shape, dtype=numpy.complex128, buffer=c)
        if c is None or beta == 0:
            ab.real = cr
            ab.imag = ci
        else:
            ab *= beta
            ab.real += cr
            ab.imag += ci
        return ab

    elif atype == numpy.complex128 and btype == numpy.float64:
        if a.flags.f_contiguous:
            order = 'F'
        else:
            order = 'C'
        cr = ddot(numpy.asarray(a.real, order=order), b, alpha)
        ci = ddot(numpy.asarray(a.imag, order=order), b, alpha)
        ab = numpy.ndarray(cr.shape, dtype=numpy.complex128, buffer=c)
        if c is None or beta == 0:
            ab.real = cr
            ab.imag = ci
        else:
            ab *= beta
            ab.real += cr
            ab.imag += ci
        return ab

    else:
        if c is None:
            c = numpy.dot(a, b) * alpha
        elif beta == 0:
            c[:] = numpy.dot(a, b) * alpha
        else:
            c *= beta
            c += numpy.dot(a, b) * alpha
        return c

# a, b, c in C-order
def _dgemm(trans_a, trans_b, m, n, k, a, b, c, alpha=1, beta=0,
           offseta=0, offsetb=0, offsetc=0):
    if a.size == 0 or b.size == 0:
        if beta == 0:
            c[:] = 0
        else:
            c[:] *= beta
        return c

    assert(a.flags.c_contiguous)
    assert(b.flags.c_contiguous)
    assert(c.flags.c_contiguous)

    _np_helper.NPdgemm(ctypes.c_char(trans_b.encode('ascii')),
                       ctypes.c_char(trans_a.encode('ascii')),
                       ctypes.c_int(n), ctypes.c_int(m), ctypes.c_int(k),
                       ctypes.c_int(b.shape[1]), ctypes.c_int(a.shape[1]),
                       ctypes.c_int(c.shape[1]),
                       ctypes.c_int(offsetb), ctypes.c_int(offseta),
                       ctypes.c_int(offsetc),
                       b.ctypes.data_as(ctypes.c_void_p),
                       a.ctypes.data_as(ctypes.c_void_p),
                       c.ctypes.data_as(ctypes.c_void_p),
                       ctypes.c_double(alpha), ctypes.c_double(beta))
    return c
def _zgemm(trans_a, trans_b, m, n, k, a, b, c, alpha=1, beta=0,
           offseta=0, offsetb=0, offsetc=0):
    if a.size == 0 or b.size == 0:
        if beta == 0:
            c[:] = 0
        else:
            c[:] *= beta
        return c

    assert(a.flags.c_contiguous)
    assert(b.flags.c_contiguous)
    assert(c.flags.c_contiguous)
    assert(a.dtype == numpy.complex128)
    assert(b.dtype == numpy.complex128)
    assert(c.dtype == numpy.complex128)

    _np_helper.NPzgemm(ctypes.c_char(trans_b.encode('ascii')),
                       ctypes.c_char(trans_a.encode('ascii')),
                       ctypes.c_int(n), ctypes.c_int(m), ctypes.c_int(k),
                       ctypes.c_int(b.shape[1]), ctypes.c_int(a.shape[1]),
                       ctypes.c_int(c.shape[1]),
                       ctypes.c_int(offsetb), ctypes.c_int(offseta),
                       ctypes.c_int(offsetc),
                       b.ctypes.data_as(ctypes.c_void_p),
                       a.ctypes.data_as(ctypes.c_void_p),
                       c.ctypes.data_as(ctypes.c_void_p),
                       (ctypes.c_double*2)(alpha.real, alpha.imag),
                       (ctypes.c_double*2)(beta.real, beta.imag))
    return c

def frompointer(pointer, count, dtype=float):
    '''Interpret a buffer that the pointer refers to as a 1-dimensional array.

    Args:
        pointer : int or ctypes pointer
            address of a buffer
        count : int
            Number of items to read.
        dtype : data-type, optional
            Data-type of the returned array; default: float.

    Examples:

    >>> s = numpy.ones(3, dtype=numpy.int32)
    >>> ptr = s.ctypes.data
    >>> frompointer(ptr, count=6, dtype=numpy.int16)
    [1, 0, 1, 0, 1, 0]
    '''
    dtype = numpy.dtype(dtype)
    count *= dtype.itemsize
    buf = (ctypes.c_char * count).from_address(pointer)
    a = numpy.ndarray(count, dtype=numpy.int8, buffer=buf)
    return a.view(dtype)

from distutils.version import LooseVersion
if LooseVersion(numpy.__version__) <= LooseVersion('1.6.0'):
    def norm(x, ord=None, axis=None):
        '''numpy.linalg.norm for numpy 1.6.*
        '''
        if axis is None or ord is not None:
            return numpy.linalg.norm(x, ord)
        else:
            x = numpy.asarray(x)
            axes = string.ascii_lowercase[:x.ndim]
            target = axes.replace(axes[axis], '')
            descr = '%s,%s->%s' % (axes, axes, target)
            xx = _numpy_einsum(descr, x.conj(), x)
            return numpy.sqrt(xx.real)
else:
    norm = numpy.linalg.norm
del(LooseVersion)

def cond(x, p=None):
    '''Compute the condition number'''
    if isinstance(x, numpy.ndarray) and x.ndim == 2 or p is not None:
        return numpy.linalg.cond(x, p)
    else:
        return numpy.asarray([numpy.linalg.cond(xi) for xi in x])

def cartesian_prod(arrays, out=None):
    '''
    Generate a cartesian product of input arrays.
    http://stackoverflow.com/questions/1208118/using-numpy-to-build-an-array-of-all-combinations-of-two-arrays

    Args:
        arrays : list of array-like
            1-D arrays to form the cartesian product of.
        out : ndarray
            Array to place the cartesian product in.

    Returns:
        out : ndarray
            2-D array of shape (M, len(arrays)) containing cartesian products
            formed of input arrays.

    Examples:

    >>> cartesian_prod(([1, 2, 3], [4, 5], [6, 7]))
    array([[1, 4, 6],
           [1, 4, 7],
           [1, 5, 6],
           [1, 5, 7],
           [2, 4, 6],
           [2, 4, 7],
           [2, 5, 6],
           [2, 5, 7],
           [3, 4, 6],
           [3, 4, 7],
           [3, 5, 6],
           [3, 5, 7]])

    '''
    arrays = [numpy.asarray(x) for x in arrays]
    dtype = numpy.result_type(*arrays)
    nd = len(arrays)
    dims = [nd] + [len(x) for x in arrays]
    out = numpy.ndarray(dims, dtype, buffer=out)

    shape = [-1] + [1] * nd
    for i, arr in enumerate(arrays):
        out[i] = arr.reshape(shape[:nd-i])

    return out.reshape(nd,-1).T

def direct_sum(subscripts, *operands):
    '''Apply the summation over many operands with the einsum fashion.

    Examples:

    >>> a = numpy.random.random((6,5))
    >>> b = numpy.random.random((4,3,2))
    >>> direct_sum('ij,klm->ijklm', a, b).shape
    (6, 5, 4, 3, 2)
    >>> direct_sum('ij,klm', a, b).shape
    (6, 5, 4, 3, 2)
    >>> direct_sum('i,j,klm->mjlik', a[0], a[:,0], b).shape
    (2, 6, 3, 5, 4)
    >>> direct_sum('ij-klm->ijklm', a, b).shape
    (6, 5, 4, 3, 2)
    >>> direct_sum('ij+klm', a, b).shape
    (6, 5, 4, 3, 2)
    >>> direct_sum('-i-j+klm->mjlik', a[0], a[:,0], b).shape
    (2, 6, 3, 5, 4)
    >>> c = numpy.random((3,5))
    >>> z = direct_sum('ik+jk->kij', a, c).shape  # This is slow
    >>> abs(a.T.reshape(5,6,1) + c.reshape(5,1,3) - z).sum()
    0.0
    '''

    def sign_and_symbs(subscript):
        ''' sign list and notation list'''
        subscript = subscript.replace(' ', '').replace(',', '+')

        if subscript[0] not in '+-':
            subscript = '+' + subscript
        sign = [x for x in subscript if x in '+-']

        symbs = subscript[1:].replace('-', '+').split('+')
        #s = ''.join(symbs)
        #assert(len(set(s)) == len(s))  # make sure no duplicated symbols
        return sign, symbs

    if '->' in subscripts:
        src, dest = subscripts.split('->')
        sign, src = sign_and_symbs(src)
        dest = dest.replace(' ', '')
    else:
        sign, src = sign_and_symbs(subscripts)
        dest = ''.join(src)
    assert(len(src) == len(operands))

    for i, symb in enumerate(src):
        op = numpy.asarray(operands[i])
        assert(len(symb) == op.ndim)
        unisymb = set(symb)
        if len(unisymb) != len(symb):
            unisymb = ''.join(unisymb)
            op = _numpy_einsum('->'.join((symb, unisymb)), op)
            src[i] = unisymb
        if i == 0:
            if sign[i] == '+':
                out = op
            else:
                out = -op
        elif sign[i] == '+':
            out = out.reshape(out.shape+(1,)*op.ndim) + op
        else:
            out = out.reshape(out.shape+(1,)*op.ndim) - op

    out = _numpy_einsum('->'.join((''.join(src), dest)), out)
    out.flags.writeable = True  # old numpy has this issue
    return out

def condense(opname, a, loc_x, loc_y=None):
    '''
    .. code-block:: python

        for i,i0 in enumerate(loc_x):
            i1 = loc_x[i+1]
            for j,j0 in enumerate(loc_y):
                j1 = loc_y[j+1]
                out[i,j] = op(a[i0:i1,j0:j1])
    '''
    assert(a.dtype == numpy.double)
    if not opname.startswith('NP_'):
        opname = 'NP_' + opname
    op = getattr(_np_helper, opname)
    if loc_y is None:
        loc_y = loc_x
    loc_x = numpy.asarray(loc_x, numpy.int32)
    loc_y = numpy.asarray(loc_y, numpy.int32)
    nloc_x = loc_x.size - 1
    nloc_y = loc_y.size - 1
    if a.flags.f_contiguous:
        out = numpy.zeros((nloc_x, nloc_y), order='F')
    else:
        a = numpy.asarray(a, order='C')
        out = numpy.zeros((nloc_x, nloc_y))
    _np_helper.NPcondense(op, out.ctypes.data_as(ctypes.c_void_p),
                          a.ctypes.data_as(ctypes.c_void_p),
                          loc_x.ctypes.data_as(ctypes.c_void_p),
                          loc_y.ctypes.data_as(ctypes.c_void_p),
                          ctypes.c_int(nloc_x), ctypes.c_int(nloc_y))
    return out

def expm(a):
    '''Equivalent to scipy.linalg.expm'''
    bs = [a.copy()]
    n = 0
    for n in range(1, 14):
        bs.append(ddot(bs[-1], a))
        radius = (2**(n*(n+2))*math.factorial(n+2)*1e-16) **((n+1.)/(n+2))
        #print(n, radius, bs[-1].max(), -bs[-1].min())
        if bs[-1].max() < radius and -bs[-1].min() < radius:
            break

    y = numpy.eye(a.shape[0])
    fac = 1
    for i, b in enumerate(bs):
        fac *= i + 1
        b *= (.5**(n*(i+1)) / fac)
        y += b
    buf, bs = bs[0], None
    for i in range(n):
        ddot(y, y, 1, buf, 0)
        y, buf = buf, y
    return y


class NPArrayWithTag(numpy.ndarray):
    # Initialize kwargs in function tag_array
    #def __new__(cls, a, **kwargs):
    #    obj = numpy.asarray(a).view(cls)
    #    obj.__dict__.update(kwargs)
    #    return obj

    # Customize __reduce__ and __setstate__ to keep tags after serialization
    # pickle.loads(pickle.dumps(tagarray)).  This is needed by mpi communication
    def __reduce__(self):
        pickled = numpy.ndarray.__reduce__(self)
        state = pickled[2] + (self.__dict__,)
        return (pickled[0], pickled[1], state)

    def __setstate__(self, state):
        numpy.ndarray.__setstate__(self, state[0:-1])
        self.__dict__.update(state[-1])

    # Whenever the contents of the array was modified (through ufunc), the tag
    # should be expired. Overwrite the output of ufunc to restore ndarray type.
    def __array_wrap__(self, out, context=None):
        return numpy.ndarray.__array_wrap__(self, out, context).view(numpy.ndarray)


def tag_array(a, **kwargs):
    '''Attach attributes to numpy ndarray. The attribute name and value are
    obtained from the keyword arguments.
    '''
    # Make a shadow copy in any circumstance by converting it to an nparray.
    # If a is an object of NPArrayWithTag, all attributes will be lost in this
    # conversion. They need to be restored.
    t = numpy.asarray(a).view(NPArrayWithTag)

    if isinstance(a, NPArrayWithTag):
        t.__dict__.update(a.__dict__)
    t.__dict__.update(kwargs)
    return t

#TODO: merge with function pbc.cc.kccsd_rhf.vector_to_nested
def split_reshape(a, shapes):
    '''
    Split a vector into multiple tensors. shapes is a list of tuples.
    The entries of shapes indicate the shape of each tensor.

    Returns:
        tensors : a list of tensors

    Examples:

    >>> a = numpy.arange(12)
    >>> split_reshape(a, ((2,3), (1,), ((2,2), (1,1))))
    [array([[0, 1, 2],
            [3, 4, 5]]),
     array([6]),
     [array([[ 7,  8],
             [ 9, 10]]),
      array([[11]])]]
    '''
    if isinstance(shapes[0], (int, numpy.integer)):
        return a.reshape(shapes)

    def sub_split(a, shapes):
        tensors = []
        p1 = 0
        for shape in shapes:
            if isinstance(shape[0], (int, numpy.integer)):
                p0, p1 = p1, p1 + numpy.prod(shape)
                tensors.append(a[p0:p1].reshape(shape))
            else:
                subtensors, size = sub_split(a[p1:], shape)
                p1 += size
                tensors.append(subtensors)
        size = p1
        return tensors, size
    return sub_split(a, shapes)[0]

if __name__ == '__main__':
    a = numpy.random.random((30,40,5,10))
    b = numpy.random.random((10,30,5,20))
    c = numpy.random.random((10,20,20))
    d = numpy.random.random((20,10))
    f = einsum('ijkl,xiky,ayp,px->ajl', a,b,c,d, optimize=True)
    ref = einsum('ijkl,xiky->jlxy', a, b)
    ref = einsum('jlxy,ayp->jlxap', ref, c)
    ref = einsum('jlxap,px->ajl', ref, d)
    print(abs(ref-f).max())