1 /*-------------------------------------------------------------------
2 Copyright 2018 Ravishankar Sundararaman
3
4 This file is part of JDFTx.
5
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19
20 #include <core/matrix.h>
21 #include <core/GpuUtil.h>
22 #include <core/Random.h>
23
24 //------------- Sub-matrices ------------------------
25
operator ()(int i,int j) const26 complex matrix::operator()(int i, int j) const
27 { assert(i<nr and i>=0);
28 assert(j<nc and j>=0);
29 if(isOnGpu())
30 { complex ret;
31 #ifdef GPU_ENABLED
32 cudaMemcpy(&ret, dataGpu()+index(i,j), sizeof(complex), cudaMemcpyDeviceToHost);
33 #else
34 assert(!"onGpu=true without GPU_ENABLED");
35 #endif
36 return ret;
37 }
38 else return data()[index(i,j)];
39 }
40
operator ()(int iStart,int iStop,int jStart,int jStop) const41 matrixScaledTransOp matrix::operator()(int iStart, int iStop, int jStart, int jStop) const
42 { return matrixScaledTransOp(*this, iStart,iStop, jStart,jStop);
43 }
44
45 #ifdef GPU_ENABLED
46 void matrixSubGet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out); //implemented in operators.cu
47 #endif
operator ()(int iStart,int iStep,int iStop,int jStart,int jStep,int jStop) const48 matrix matrix::operator()(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop) const
49 { if(iStart==0 && iStep==1 && iStop==nr && jStart==0 && jStep==1 && jStop==nc)
50 return *this; //faster to copy matrix for this special case
51
52 assert(iStart>=0 && iStart<nr);
53 assert(iStop>iStart && iStop<=nr);
54 assert(iStep>0);
55 assert(jStart>=0 && jStart<nc);
56 assert(jStop>jStart && jStop<=nc);
57 assert(jStep>0);
58
59 int iDelta = ceildiv(iStop-iStart, iStep), jDelta = ceildiv(jStop-jStart, jStep);
60 matrix ret(iDelta,jDelta, isGpuEnabled()); complex* retData = ret.dataPref(); const complex* thisData = this->dataPref();
61 #ifdef GPU_ENABLED
62 matrixSubGet_gpu(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, thisData, retData);
63 #else
64 for(int i=0; i<iDelta; i++)
65 for(int j=0; j<jDelta; j++)
66 retData[ret.index(i,j)] = thisData[this->index(i*iStep+iStart, j*jStep+jStart)];
67 #endif
68 return ret;
69 }
70
71
set(int i,int j,complex m)72 void matrix::set(int i, int j, complex m)
73 { assert(i<nr and i>=0);
74 assert(j<nc and j>=0);
75 if(isOnGpu())
76 {
77 #ifdef GPU_ENABLED
78 cudaMemcpy(dataGpu()+index(i,j), &m, sizeof(complex), cudaMemcpyHostToDevice);
79 #else
80 assert(!"onGpu=true without GPU_ENABLED");
81 #endif
82 }
83 else data()[index(i,j)] = m;
84 }
85
86 #ifdef GPU_ENABLED
87 void matrixSubSet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
88 void matrixSubAccum_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
89 #endif
90 #define DECLARE_matrixSubSetAccum(nAME, NAME, OP) \
91 void matrixSub ##NAME(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
92 { for(int j=0; j<jDelta; j++) \
93 for(int i=0; i<iDelta; i++) \
94 out[nr*(j*jStep+jStart) + (i*iStep+iStart)] OP *(in++); \
95 } \
96 void matrix::nAME(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop, const matrix& m) \
97 { static StopWatch watch("matrix::" #nAME); watch.start(); \
98 assert(iStart>=0 && iStart<nr); \
99 assert(iStop>iStart && iStop<=nr); \
100 assert(iStep>0); \
101 assert(jStart>=0 || jStart<nc); \
102 assert(jStop>jStart || jStop<=nc); \
103 assert(jStep>0); \
104 int iDelta = ceildiv(iStop-iStart, iStep); \
105 int jDelta = ceildiv(jStop-jStart, jStep); \
106 assert(iDelta==m.nr); \
107 assert(jDelta==m.nc); \
108 callPref(matrixSub ##NAME)(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, m.dataPref(), dataPref()); \
109 watch.stop(); \
110 }
111 DECLARE_matrixSubSetAccum(set, Set, =)
112 DECLARE_matrixSubSetAccum(accum, Accum, +=)
113 #undef DECLARE_matrixSubSetAccum
114
115 //----------------------- Arithmetic ---------------------
116
operator *(const matrixScaledTransOp & m1st,const matrixScaledTransOp & m2st)117 matrix operator*(const matrixScaledTransOp &m1st, const matrixScaledTransOp &m2st)
118 { assert(m1st.nCols() == m2st.nRows());
119 const matrix& m1 = m1st.mat;
120 const matrix& m2 = m2st.mat;
121 double scaleFac = m1st.scale * m2st.scale;
122 matrix ret(m1st.nRows(), m2st.nCols(), isGpuEnabled());
123 callPref(eblas_zgemm)(m1st.op, m2st.op,
124 ret.nRows(), ret.nCols(), m1st.nCols(), scaleFac,
125 m1.dataPref()+m1st.index(0,0), m1.nRows(),
126 m2.dataPref()+m2st.index(0,0), m2.nRows(),
127 0.0, ret.dataPref(), ret.nRows());
128 return ret;
129 }
130
131 //----- diagonal matrix multiplies
mulMD(int nRows,int nCols,const complex * M,const scalar * D,complex * out)132 template<typename scalar> void mulMD(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
133 { for(int j=0; j<nCols; j++)
134 for(int i=0; i<nRows; i++)
135 *(out++) = *(M++) * D[j];
136 }
mulDM(int nRows,int nCols,const scalar * D,const complex * M,complex * out)137 template<typename scalar> void mulDM(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
138 { for(int j=0; j<nCols; j++)
139 for(int i=0; i<nRows; i++)
140 *(out++) = D[i] * (*(M++));
141 }
142 #define mulMDdouble mulMD<double>
143 #define mulMDcomplex mulMD<complex>
144 #define mulDMdouble mulDM<double>
145 #define mulDMcomplex mulDM<complex>
146 #ifdef GPU_ENABLED
147 void mulMDdouble_gpu(int nRows, int nCols, const complex* M, const double* D, complex* out);
148 void mulDMdouble_gpu(int nRows, int nCols, const double* D, const complex* M, complex* out);
149 void mulMDcomplex_gpu(int nRows, int nCols, const complex* M, const complex* D, complex* out);
150 void mulDMcomplex_gpu(int nRows, int nCols, const complex* D, const complex* M, complex* out);
151 #define GetData(D,scalar) ManagedArray<scalar>(D).dataGpu()
152 #else
153 #define GetData(D,scalar) D.data()
154 #endif
operator *(const matrix & m,const diagMatrix & d)155 matrix operator*(const matrix& m, const diagMatrix& d)
156 { assert(m.nCols()==d.nRows());
157 matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
158 callPref(mulMDdouble)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,double), ret.dataPref());
159 return ret;
160 }
operator *(const matrix & m,const std::vector<complex> & d)161 matrix operator*(const matrix& m, const std::vector<complex>& d)
162 { assert(m.nCols()==int(d.size()));
163 matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
164 callPref(mulMDcomplex)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,complex), ret.dataPref());
165 return ret;
166 }
operator *(const diagMatrix & d,const matrix & m)167 matrix operator*(const diagMatrix& d, const matrix& m)
168 { assert(d.nCols()==m.nRows());
169 matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
170 callPref(mulDMdouble)(m.nRows(), m.nCols(), GetData(d,double), m.dataPref(), ret.dataPref());
171 return ret;
172 }
operator *(const std::vector<complex> & d,const matrix & m)173 matrix operator*(const std::vector<complex>& d, const matrix& m)
174 { assert(int(d.size())==m.nRows());
175 matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
176 callPref(mulDMcomplex)(m.nRows(), m.nCols(), GetData(d,complex), m.dataPref(), ret.dataPref());
177 return ret;
178 }
179
180
operator *(const diagMatrix & d1,const diagMatrix & d2)181 diagMatrix operator*(const diagMatrix& d1, const diagMatrix& d2)
182 { assert(d1.nCols()==d2.nRows());
183 diagMatrix ret(d1);
184 for(int i=0; i<ret.nRows(); i++) ret[i] *= d2[i]; //elementwise multiply
185 return ret;
186 }
187
axpy(double alpha,const diagMatrix & x,matrix & y)188 void axpy(double alpha, const diagMatrix& x, matrix& y)
189 { assert(x.nRows()==y.nRows());
190 assert(x.nCols()==y.nCols());
191 int diagStride = y.nRows() + 1;
192 callPref(eblas_daxpy)(x.nRows(), alpha, GetData(x,double),1, (double*)y.dataPref(),diagStride*2);
193 }
194
axpy(double alpha,const diagMatrix & x,diagMatrix & y)195 void axpy(double alpha, const diagMatrix& x, diagMatrix& y)
196 { assert(x.nRows()==y.nRows());
197 for(int i=0; i<y.nRows(); i++) y[i] += alpha * x[i];
198 }
199
clone(const diagMatrix & x)200 diagMatrix clone(const diagMatrix& x) { return x; }
clone(const matrix & x)201 matrix clone(const matrix& x) { return x; }
202
dot(const diagMatrix & x,const diagMatrix & y)203 double dot(const diagMatrix& x, const diagMatrix& y)
204 { assert(x.size()==y.size());
205 double ret = 0.;
206 for(size_t i=0; i<x.size(); i++)
207 ret += x[i]*y[i];
208 return ret;
209 }
dot(const matrix & x,const matrix & y)210 double dot(const matrix& x, const matrix& y) { return dotc(x,y).real(); }
211
randomize(diagMatrix & x)212 void randomize(diagMatrix& x) { for(size_t i=0; i<x.size(); i++) x[i] = Random::normal(); }
randomize(matrix & x)213 void randomize(matrix& x)
214 { complex* xData = x.data();
215 for(size_t i=0; i<x.nData(); i++)
216 xData[i] = Random::normalComplex();
217 }
218
219 //Compute diag(dagger(X)*Y) efficiently (avoid the off-diagonals)
diagDot_thread(int colStart,int colStop,int nRows,const complex * X,const complex * Y,double * out)220 void diagDot_thread(int colStart, int colStop, int nRows, const complex* X, const complex* Y, double* out)
221 { const complex* Xcur = X + colStart*nRows;
222 const complex* Ycur = Y + colStart*nRows;
223 for(int col=colStart; col<colStop; col++)
224 { double result = 0.;
225 for(int iRow=0; iRow<nRows; iRow++)
226 result += ((Xcur++)->conj() * (*(Ycur++))).real();
227 out[col] = result;
228 }
229 }
diagDot(int nRows,int nCols,const complex * X,const complex * Y,double * out)230 void diagDot(int nRows, int nCols, const complex* X, const complex* Y, double* out)
231 { threadLaunch(diagDot_thread, nCols, nRows, X, Y, out);
232 }
233 #ifdef GPU_ENABLED
234 void diagDot_gpu(int nRows, int nCols, const complex* X, const complex* Y, double* out); //implemented in matrixOperators.cu
235 #endif
diagDot(const matrix & X,const matrix & Y)236 diagMatrix diagDot(const matrix& X, const matrix& Y)
237 { static StopWatch watch("diagDot"); watch.start();
238 assert(X.nCols()==Y.nCols());
239 assert(X.nRows()==Y.nRows());
240 ManagedArray<double> buf; buf.init(X.nCols(), isGpuEnabled());
241 callPref(diagDot)(X.nRows(), X.nCols(), X.dataPref(), Y.dataPref(), buf.dataPref());
242 //Copy to diagMatrix:
243 diagMatrix result(X.nCols());
244 eblas_copy(result.data(), buf.data(), X.nCols());
245 watch.stop();
246 return result;
247 }
248
249 //---------------- Misc matrix ops --------------------
250
det(const matrix & A)251 complex det(const matrix& A)
252 {
253 matrix decomposition = LU(A);
254 int N = A.nRows();
255
256 // Multiplies the diagonal entries to get the determinant up to a sign
257 complex determinant(1., 0.);
258 for(int i=0; i<N; i++)
259 determinant *= decomposition(i,i);
260
261 return determinant;
262
263 }
264
det(const diagMatrix & A)265 double det(const diagMatrix& A)
266 { double determinant = 1.;
267 for(int i=0; i<A.nCols(); i++)
268 determinant *= A[i];
269 return determinant;
270 }
271
trace(const matrix & A)272 complex trace(const matrix &A)
273 { assert(A.nRows() == A.nCols());
274 matrix one(eye(1));
275 return callPref(eblas_zdotc)(A.nRows(), one.dataPref(),0, A.dataPref(),A.nCols()+1);
276 }
277
trace(const diagMatrix & A)278 double trace(const diagMatrix& A)
279 { double ret=0.0;
280 for(double d: A) ret += d;
281 return ret;
282 }
283
nrm2(const diagMatrix & A)284 double nrm2(const diagMatrix& A)
285 { double ret=0.0;
286 for(double d: A) ret += d*d;
287 return sqrt(ret);
288 }
289
diag(const matrix & A)290 diagMatrix diag(const matrix &A)
291 { int N = A.nRows();
292 assert(N==A.nCols());
293 diagMatrix ret(N);
294 int diagStride = N + 1;
295 #ifdef GPU_ENABLED
296 ManagedArray<double> retManaged; retManaged.init(N, true);
297 eblas_zero_gpu(N, retManaged.dataGpu());
298 eblas_daxpy_gpu(N, 1., (const double*)A.dataGpu(),diagStride*2, retManaged.dataGpu(),1);
299 eblas_copy(ret.data(), retManaged.data(), N);
300 #else
301 eblas_daxpy(N, 1., (const double*)A.data(),diagStride*2, ret.data(),1);
302 #endif
303 return ret;
304 }
305
eye(int N)306 diagMatrix eye(int N)
307 { diagMatrix ret(N, 1.);
308 return ret;
309 }
310
zeroes(int nRows,int nCols)311 matrix zeroes(int nRows, int nCols)
312 { matrix ret(nRows, nCols, isGpuEnabled());
313 ret.zero();
314 return ret;
315 }
316
317
318 //Apply pending transpose / dagger operations:
operator matrix() const319 matrixScaledTransOp::operator matrix() const
320 { if(op==CblasNoTrans)
321 return scale * mat(iStart,1,iStart+iDelta, jStart,1,jStart+jDelta);
322 else
323 { matrix ret = zeroes(nRows(), nCols());
324 //Copy data, applying transpose and submatrix
325 complex* retData = ret.dataPref();
326 const complex* inData = mat.dataPref()+index(0,0);
327 for(int j=0; j<ret.nCols(); j++)
328 { callPref(eblas_zaxpy)(ret.nRows(), scale, inData,mat.nRows(), retData,1);
329 retData += ret.nRows();
330 inData++;
331 }
332 //Apply conjugate if needed:
333 if(op==CblasConjTrans)
334 callPref(eblas_dscal)(ret.nData(), -1., ((double*)ret.dataPref())+1, 2); //negate imag part
335 return ret;
336 }
337 }
338
339 //Complex conjugate:
conj(const scaled<matrix> & A)340 matrix conj(const scaled<matrix>& A)
341 { matrix B = A.data;
342 double* Bdata = (double*)B.dataPref();
343 callPref(eblas_dscal)(B.nData(), A.scale, Bdata, 2); //real parts
344 callPref(eblas_dscal)(B.nData(), -A.scale, Bdata+1, 2); //imag parts
345 return B;
346 }
347
348 // Hermitian adjoint
dagger(const matrixScaledTransOp & A)349 matrixScaledTransOp dagger(const matrixScaledTransOp& A)
350 { matrixScaledTransOp result(A);
351 assert(A.op != CblasTrans); //dagger(CblasTrans) = "CblasConj" is not supported in BLAS
352 result.op = (A.op==CblasNoTrans) ? CblasConjTrans : CblasNoTrans;
353 return result;
354 //Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
355 }
dagger_symmetrize(const matrixScaledTransOp & A)356 matrix dagger_symmetrize(const matrixScaledTransOp& A)
357 { return 0.5*(dagger(A) + A);
358 }
359 // Transpose
transpose(const matrixScaledTransOp & A)360 matrixScaledTransOp transpose(const matrixScaledTransOp& A)
361 { matrixScaledTransOp result(A);
362 assert(A.op != CblasConjTrans); //transpose(CblasConjTrans) = "CblasConj" is not supported in BLAS
363 result.op = (A.op==CblasNoTrans) ? CblasTrans : CblasNoTrans;
364 return result;
365 //Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
366 }
transpose_symmetrize(const matrixScaledTransOp & A)367 matrix transpose_symmetrize(const matrixScaledTransOp& A)
368 { return 0.5*(transpose(A) + A);
369 }
370
nrm2(const matrixScaledTransOp & A)371 double nrm2(const matrixScaledTransOp& A)
372 { const complex* Adata = A.mat.dataPref() + A.mat.index(A.iStart,A.jStart);
373 if(A.iDelta == A.mat.nRows()) //contiguous
374 return callPref(eblas_dznrm2)(A.iDelta*A.jDelta, Adata, 1);
375 else //not=contiguous
376 { double resultSq = 0.;
377 for(int j=0; j<A.jDelta; j++) //loop over columns, each of which is contiguous
378 { resultSq += std::pow(callPref(eblas_dznrm2)(A.iDelta, Adata, 1), 2);
379 Adata += A.mat.nRows(); //stride between columns of the matrix
380 }
381 return sqrt(resultSq);
382 }
383 //Note: transpose/conjugate do not affect nrm2 and hence do not need to be handled above
384 }
385
relativeHermiticityError(int N,const complex * data)386 double relativeHermiticityError(int N, const complex* data)
387 { double errNum=0.0, errDen=1e-20; //avoid irrelevant error for zero matrix
388 for(int i=0; i<N; i++)
389 for(int j=0; j<N; j++)
390 { int index = N*i + j;
391 int indexT = N*j + i;
392 errNum += norm(data[index]-data[indexT].conj());
393 errDen += norm(data[index]);
394 }
395 return sqrt(errNum / (errDen*N));
396 }
397
398 //------------ Block matrices ------------
399
tiledBlockMatrix(const matrix & mBlock,int nBlocks,const std::vector<complex> * phaseArr)400 tiledBlockMatrix::tiledBlockMatrix(const matrix& mBlock, int nBlocks, const std::vector<complex>* phaseArr) : mBlock(mBlock), nBlocks(nBlocks), phaseArr(phaseArr)
401 { if(phaseArr) assert(nBlocks==int(phaseArr->size()));
402 }
403
operator *(const matrix & other) const404 matrix tiledBlockMatrix::operator*(const matrix& other) const
405 { assert(mBlock.nCols()*nBlocks == other.nRows());
406 matrix result(mBlock.nRows()*nBlocks, other.nCols(), isGpuEnabled());
407 //Dense matrix multiply for each block:
408 for(int iBlock=0; iBlock<nBlocks; iBlock++)
409 { int offs = iBlock * mBlock.nCols();
410 complex phase = phaseArr ? phaseArr->at(iBlock) : 1.;
411 callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, mBlock.nRows(), other.nCols(), mBlock.nCols(),
412 phase, mBlock.dataPref(), mBlock.nRows(), other.dataPref()+offs, other.nRows(),
413 0.0, result.dataPref()+offs, result.nRows());
414 }
415 return result;
416 }
417
operator *(const matrix & m,const tiledBlockMatrix & tbm)418 matrix operator*(const matrix& m, const tiledBlockMatrix& tbm)
419 { assert(m.nCols() == tbm.mBlock.nRows()*tbm.nBlocks);
420 matrix result(m.nRows(), tbm.mBlock.nCols()*tbm.nBlocks, isGpuEnabled());
421 //Dense matrix multiply for each block:
422 for(int iBlock=0; iBlock<tbm.nBlocks; iBlock++)
423 { int offsIn = iBlock * tbm.mBlock.nRows() * m.nRows();
424 int offsOut = iBlock * tbm.mBlock.nCols() * m.nRows();
425 complex phase = tbm.phaseArr ? tbm.phaseArr->at(iBlock) : 1.;
426 callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, m.nRows(), tbm.mBlock.nCols(), tbm.mBlock.nRows(),
427 phase, m.dataPref()+offsIn, m.nRows(), tbm.mBlock.dataPref(), tbm.mBlock.nRows(),
428 0.0, result.dataPref()+offsOut, result.nRows());
429 }
430 return result;
431 }
432