/*-------------------------------------------------------------------
Copyright 2018 Ravishankar Sundararaman
This file is part of JDFTx.
JDFTx is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
JDFTx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
-------------------------------------------------------------------*/
#include <core/matrix.h>
#include <core/GpuUtil.h>
#include <core/BlasExtra.h>
//------------- Sub-matrices ------------------------
complex matrix::operator()(int i, int j) const
{	assert(i>=0 && i<nr);
	assert(j>=0 && j<nc);
if(isOnGpu())
{ complex ret;
#ifdef GPU_ENABLED
cudaMemcpy(&ret, dataGpu()+index(i,j), sizeof(complex), cudaMemcpyDeviceToHost);
#else
assert(!"onGpu=true without GPU_ENABLED");
#endif
return ret;
}
else return data()[index(i,j)];
}
matrixScaledTransOp matrix::operator()(int iStart, int iStop, int jStart, int jStop) const
{ return matrixScaledTransOp(*this, iStart,iStop, jStart,jStop);
}
#ifdef GPU_ENABLED
void matrixSubGet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out); //implemented in operators.cu
#endif
matrix matrix::operator()(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop) const
{ if(iStart==0 && iStep==1 && iStop==nr && jStart==0 && jStep==1 && jStop==nc)
return *this; //faster to copy matrix for this special case
assert(iStart>=0 && iStart<nr);
assert(iStop>iStart && iStop<=nr);
assert(iStep>0);
assert(jStart>=0 && jStart<nc);
assert(jStop>jStart && jStop<=nc);
assert(jStep>0);
int iDelta = ceildiv(iStop-iStart, iStep), jDelta = ceildiv(jStop-jStart, jStep);
matrix ret(iDelta,jDelta, isGpuEnabled()); complex* retData = ret.dataPref(); const complex* thisData = this->dataPref();
#ifdef GPU_ENABLED
matrixSubGet_gpu(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, thisData, retData);
#else
for(int i=0; i<iDelta; i++)
	for(int j=0; j<jDelta; j++)
		retData[ret.index(i,j)] = thisData[this->index(i*iStep+iStart, j*jStep+jStart)];
#endif
return ret;
}
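//Usage sketch for the accessors above (variable names and sizes are illustrative, not from the library):
//  complex x = M(0,2);                    //read one element (copied from GPU if the matrix lives there)
//  matrix S = M(0,2,8, 0,2,8);            //extract every other row and column of the leading 8x8 block
//  matrixScaledTransOp V = M(0,4, 0,4);   //lightweight reference to the leading 4x4 sub-block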
void matrix::set(int i, int j, complex m)
{	assert(i>=0 && i<nr);
	assert(j>=0 && j<nc);
if(isOnGpu())
{
#ifdef GPU_ENABLED
cudaMemcpy(dataGpu()+index(i,j), &m, sizeof(complex), cudaMemcpyHostToDevice);
#else
assert(!"onGpu=true without GPU_ENABLED");
#endif
}
else data()[index(i,j)] = m;
}
#ifdef GPU_ENABLED
void matrixSubSet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
void matrixSubAccum_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
#endif
#define DECLARE_matrixSubSetAccum(nAME, NAME, OP) \
void matrixSub ##NAME(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
{	for(int j=0; j<jDelta; j++) \
		for(int i=0; i<iDelta; i++) \
			out[(j*jStep+jStart)*nr + (i*iStep+iStart)] OP *(in++); \
} \
void matrix::nAME(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop, const matrix& m) \
{	static StopWatch watch("matrix::" #nAME); watch.start(); \
	assert(iStart>=0 && iStart<nr); \
	assert(iStop>iStart && iStop<=nr); \
	assert(iStep>0); \
	assert(jStart>=0 && jStart<nc); \
	assert(jStop>jStart && jStop<=nc); \
	assert(jStep>0); \
int iDelta = ceildiv(iStop-iStart, iStep); \
int jDelta = ceildiv(jStop-jStart, jStep); \
assert(iDelta==m.nr); \
assert(jDelta==m.nc); \
callPref(matrixSub ##NAME)(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, m.dataPref(), dataPref()); \
watch.stop(); \
}
DECLARE_matrixSubSetAccum(set, Set, =)
DECLARE_matrixSubSetAccum(accum, Accum, +=)
#undef DECLARE_matrixSubSetAccum
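//Note: the matrix::set and matrix::accum members generated above overwrite or add into the
//(iStart:iStep:iStop, jStart:jStep:jStop) sub-block respectively; the asserts require the
//source matrix m to be exactly iDelta x jDelta.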
//----------------------- Arithmetic ---------------------
matrix operator*(const matrixScaledTransOp &m1st, const matrixScaledTransOp &m2st)
{ assert(m1st.nCols() == m2st.nRows());
const matrix& m1 = m1st.mat;
const matrix& m2 = m2st.mat;
double scaleFac = m1st.scale * m2st.scale;
matrix ret(m1st.nRows(), m2st.nCols(), isGpuEnabled());
callPref(eblas_zgemm)(m1st.op, m2st.op,
ret.nRows(), ret.nCols(), m1st.nCols(), scaleFac,
m1.dataPref()+m1st.index(0,0), m1.nRows(),
m2.dataPref()+m2st.index(0,0), m2.nRows(),
0.0, ret.dataPref(), ret.nRows());
return ret;
}
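//Since dagger() and transpose() below return matrixScaledTransOp, an expression such as
//  matrix C = dagger(A) * B;   //A, B: hypothetical matrices of compatible dimensions
//lands in this routine and is evaluated by a single zgemm with the appropriate op flags,
//without materializing the conjugate-transposed operand.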
//----- diagonal matrix multiplies
template<typename scalar> void mulMD(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
{	for(int j=0; j<nCols; j++)
		for(int i=0; i<nRows; i++)
			*(out++) = *(M++) * D[j]; //scale column j of M by D[j]
}
template<typename scalar> void mulDM(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
{	for(int j=0; j<nCols; j++)
		for(int i=0; i<nRows; i++)
			*(out++) = D[i] * *(M++); //scale row i of M by D[i]
}
#define mulMDdouble mulMD<double>
#define mulMDcomplex mulMD<complex>
#define mulDMdouble mulDM<double>
#define mulDMcomplex mulDM<complex>
#ifdef GPU_ENABLED
void mulMDdouble_gpu(int nRows, int nCols, const complex* M, const double* D, complex* out);
void mulDMdouble_gpu(int nRows, int nCols, const double* D, const complex* M, complex* out);
void mulMDcomplex_gpu(int nRows, int nCols, const complex* M, const complex* D, complex* out);
void mulDMcomplex_gpu(int nRows, int nCols, const complex* D, const complex* M, complex* out);
#define GetData(D,scalar) ManagedArray<scalar>(D).dataGpu()
#else
#define GetData(D,scalar) D.data()
#endif
matrix operator*(const matrix& m, const diagMatrix& d)
{ assert(m.nCols()==d.nRows());
matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
callPref(mulMDdouble)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,double), ret.dataPref());
return ret;
}
matrix operator*(const matrix& m, const std::vector<complex>& d)
{ assert(m.nCols()==int(d.size()));
matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
callPref(mulMDcomplex)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,complex), ret.dataPref());
return ret;
}
matrix operator*(const diagMatrix& d, const matrix& m)
{ assert(d.nCols()==m.nRows());
matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
callPref(mulDMdouble)(m.nRows(), m.nCols(), GetData(d,double), m.dataPref(), ret.dataPref());
return ret;
}
matrix operator*(const std::vector<complex>& d, const matrix& m)
{ assert(int(d.size())==m.nRows());
matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
callPref(mulDMcomplex)(m.nRows(), m.nCols(), GetData(d,complex), m.dataPref(), ret.dataPref());
return ret;
}
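//The four overloads above apply a diagonal (real or complex) as a column scaling (M*D) or
//row scaling (D*M) in O(nRows*nCols) work, instead of building a dense diagonal matrix.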
diagMatrix operator*(const diagMatrix& d1, const diagMatrix& d2)
{ assert(d1.nCols()==d2.nRows());
diagMatrix ret(d1);
for(int i=0; i<ret.nRows(); i++) ret[i] *= d2[i];
	return ret;
}

//--- diagonal of a matrix product (one per-column dot product on each thread):
void diagDot_thread(int colStart, int colStop, int nRows, const complex* X, const complex* Y, double* out)
{	for(int col=colStart; col<colStop; col++)
	{	double result = 0.;
		const complex* Xcur = X + col*nRows;
		const complex* Ycur = Y + col*nRows;
		for(int iRow=0; iRow<nRows; iRow++)
			result += ((Xcur++)->conj() * (*(Ycur++))).real();
out[col] = result;
}
}
void diagDot(int nRows, int nCols, const complex* X, const complex* Y, double* out)
{ threadLaunch(diagDot_thread, nCols, nRows, X, Y, out);
}
#ifdef GPU_ENABLED
void diagDot_gpu(int nRows, int nCols, const complex* X, const complex* Y, double* out); //implemented in matrixOperators.cu
#endif
diagMatrix diagDot(const matrix& X, const matrix& Y)
{ static StopWatch watch("diagDot"); watch.start();
assert(X.nCols()==Y.nCols());
assert(X.nRows()==Y.nRows());
ManagedArray<double> buf; buf.init(X.nCols(), isGpuEnabled());
callPref(diagDot)(X.nRows(), X.nCols(), X.dataPref(), Y.dataPref(), buf.dataPref());
//Copy to diagMatrix:
diagMatrix result(X.nCols());
eblas_copy(result.data(), buf.data(), X.nCols());
watch.stop();
return result;
}
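//diagDot(X,Y) therefore returns the real part of the diagonal of dagger(X)*Y, one entry per column,
//at O(nRows*nCols) cost rather than forming the full nCols x nCols product.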
//---------------- Misc matrix ops --------------------
complex det(const matrix& A)
{
matrix decomposition = LU(A);
int N = A.nRows();
// Multiplies the diagonal entries to get the determinant up to a sign
complex determinant(1., 0.);
for(int i=0; i<N; i++)
		determinant *= decomposition(i,i);
	return determinant;
}

//Extract the diagonal of a square matrix:
diagMatrix diag(const matrix& A)
{	assert(A.nRows()==A.nCols());
	int N = A.nRows();
	diagMatrix ret(N);
	int diagStride = N + 1;
	#ifdef GPU_ENABLED
	ManagedArray<double> retManaged; retManaged.init(N, true);
eblas_zero_gpu(N, retManaged.dataGpu());
eblas_daxpy_gpu(N, 1., (const double*)A.dataGpu(),diagStride*2, retManaged.dataGpu(),1);
eblas_copy(ret.data(), retManaged.data(), N);
#else
eblas_daxpy(N, 1., (const double*)A.data(),diagStride*2, ret.data(),1);
#endif
return ret;
}
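//Note: the daxpy stride of diagStride*2 doubles (= N+1 complex entries) above walks exactly the
//real parts of A(i,i), so the diagonal is extracted without any temporary square copy.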
diagMatrix eye(int N)
{ diagMatrix ret(N, 1.);
return ret;
}
matrix zeroes(int nRows, int nCols)
{ matrix ret(nRows, nCols, isGpuEnabled());
ret.zero();
return ret;
}
//Apply pending transpose / dagger operations:
matrixScaledTransOp::operator matrix() const
{ if(op==CblasNoTrans)
return scale * mat(iStart,1,iStart+iDelta, jStart,1,jStart+jDelta);
else
{ matrix ret = zeroes(nRows(), nCols());
//Copy data, applying transpose and submatrix
complex* retData = ret.dataPref();
const complex* inData = mat.dataPref()+index(0,0);
for(int j=0; j<nCols(); j++) //each result column is a row of the input sub-matrix
			callPref(eblas_zaxpy)(nRows(), scale, inData+j, mat.nRows(), retData+j*nRows(), 1);
		if(op==CblasConjTrans) //negate imaginary parts to apply the conjugation
			callPref(eblas_dscal)(ret.nData(), -1., ((double*)ret.dataPref())+1, 2);
		return ret;
	}
}

//Complex conjugate:
matrix conj(const scaled<matrix>& A)
{ matrix B = A.data;
double* Bdata = (double*)B.dataPref();
callPref(eblas_dscal)(B.nData(), A.scale, Bdata, 2); //real parts
callPref(eblas_dscal)(B.nData(), -A.scale, Bdata+1, 2); //imag parts
return B;
}
// Hermitian adjoint
matrixScaledTransOp dagger(const matrixScaledTransOp& A)
{ matrixScaledTransOp result(A);
assert(A.op != CblasTrans); //dagger(CblasTrans) = "CblasConj" is not supported in BLAS
result.op = (A.op==CblasNoTrans) ? CblasConjTrans : CblasNoTrans;
return result;
//Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
}
matrix dagger_symmetrize(const matrixScaledTransOp& A)
{ return 0.5*(dagger(A) + A);
}
// Transpose
matrixScaledTransOp transpose(const matrixScaledTransOp& A)
{ matrixScaledTransOp result(A);
assert(A.op != CblasConjTrans); //transpose(CblasConjTrans) = "CblasConj" is not supported in BLAS
result.op = (A.op==CblasNoTrans) ? CblasTrans : CblasNoTrans;
return result;
//Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
}
matrix transpose_symmetrize(const matrixScaledTransOp& A)
{ return 0.5*(transpose(A) + A);
}
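//dagger_symmetrize and transpose_symmetrize return the Hermitian part 0.5*(dagger(A) + A) and the
//symmetric part 0.5*(transpose(A) + A), e.g. to remove round-off asymmetry from nominally
//Hermitian/symmetric results.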
double nrm2(const matrixScaledTransOp& A)
{ const complex* Adata = A.mat.dataPref() + A.mat.index(A.iStart,A.jStart);
if(A.iDelta == A.mat.nRows()) //contiguous
return callPref(eblas_dznrm2)(A.iDelta*A.jDelta, Adata, 1);
else //not contiguous
	{	double resultSq = 0.;
		for(int j=0; j<A.jDelta; j++)
		{	double colNrm = callPref(eblas_dznrm2)(A.iDelta, Adata+j*A.mat.nRows(), 1);
			resultSq += colNrm*colNrm;
		}
		return sqrt(resultSq);
	}
}

//---------------- Tiled block matrix --------------------
tiledBlockMatrix::tiledBlockMatrix(const matrix& mBlock, int nBlocks, const std::vector<complex>* phaseArr) : mBlock(mBlock), nBlocks(nBlocks), phaseArr(phaseArr)
{ if(phaseArr) assert(nBlocks==int(phaseArr->size()));
}
matrix tiledBlockMatrix::operator*(const matrix& other) const
{ assert(mBlock.nCols()*nBlocks == other.nRows());
matrix result(mBlock.nRows()*nBlocks, other.nCols(), isGpuEnabled());
//Dense matrix multiply for each block:
for(int iBlock=0; iBlock<nBlocks; iBlock++)
	{	int offsIn = iBlock * mBlock.nCols(); //starting row within 'other' for this block
		int offsOut = iBlock * mBlock.nRows(); //starting row within 'result' for this block
		complex phase = phaseArr ? phaseArr->at(iBlock) : 1.;
		callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, mBlock.nRows(), other.nCols(), mBlock.nCols(),
			phase, mBlock.dataPref(), mBlock.nRows(), other.dataPref()+offsIn, other.nRows(),
			0.0, result.dataPref()+offsOut, result.nRows());
}
return result;
}
matrix operator*(const matrix& m, const tiledBlockMatrix& tbm)
{ assert(m.nCols() == tbm.mBlock.nRows()*tbm.nBlocks);
matrix result(m.nRows(), tbm.mBlock.nCols()*tbm.nBlocks, isGpuEnabled());
//Dense matrix multiply for each block:
for(int iBlock=0; iBlock<tbm.nBlocks; iBlock++)
	{	int offsIn = iBlock * tbm.mBlock.nRows() * m.nRows(); //start of the columns of m used by this block
		int offsOut = iBlock * tbm.mBlock.nCols() * m.nRows(); //start of the corresponding result columns
		complex phase = tbm.phaseArr ? tbm.phaseArr->at(iBlock) : 1.;
callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, m.nRows(), tbm.mBlock.nCols(), tbm.mBlock.nRows(),
phase, m.dataPref()+offsIn, m.nRows(), tbm.mBlock.dataPref(), tbm.mBlock.nRows(),
0.0, result.dataPref()+offsOut, result.nRows());
}
return result;
}
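//A tiledBlockMatrix represents a block-diagonal matrix built from nBlocks repetitions of mBlock,
//each optionally scaled by a per-block phase; both operators above exploit this structure by
//issuing nBlocks small zgemm calls on the dense block instead of one large dense multiply.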