/*-------------------------------------------------------------------
Copyright 2018 Ravishankar Sundararaman

This file is part of JDFTx.

JDFTx is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

JDFTx is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
-------------------------------------------------------------------*/

#include <core/matrix.h>
#include <core/GpuUtil.h>
#include <core/Random.h>

//------------- Sub-matrices ------------------------
complex matrix::operator()(int i, int j) const
{	assert(i<nr and i>=0);
	assert(j<nc and j>=0);
	if(isOnGpu())
	{	complex ret;
		#ifdef GPU_ENABLED
		cudaMemcpy(&ret, dataGpu()+index(i,j), sizeof(complex), cudaMemcpyDeviceToHost);
		#else
		assert(!"onGpu=true without GPU_ENABLED");
		#endif
		return ret;
	}
	else return data()[index(i,j)];
}

matrixScaledTransOp matrix::operator()(int iStart, int iStop, int jStart, int jStop) const
{	return matrixScaledTransOp(*this, iStart,iStop, jStart,jStop);
}
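//Hypothetical usage sketch (not part of the build): single-element access above copies
//one entry (via cudaMemcpy when the data lives on the GPU), while the range form returns
//a lightweight scaled/transpose view that only materializes on conversion to matrix:
//	matrix A = zeroes(4,4);
//	complex a01 = A(0,1);    //read one element
//	matrix B = A(0,2, 0,2);  //copy of the top-left 2x2 block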

#ifdef GPU_ENABLED
void matrixSubGet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out); //implemented in operators.cu
#endif
matrix matrix::operator()(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop) const
{	if(iStart==0 && iStep==1 && iStop==nr && jStart==0 && jStep==1 && jStop==nc)
		return *this; //faster to copy matrix for this special case

	assert(iStart>=0 && iStart<nr);
	assert(iStop>iStart && iStop<=nr);
	assert(iStep>0);
	assert(jStart>=0 && jStart<nc);
	assert(jStop>jStart && jStop<=nc);
	assert(jStep>0);

	int iDelta = ceildiv(iStop-iStart, iStep), jDelta = ceildiv(jStop-jStart, jStep);
	matrix ret(iDelta,jDelta, isGpuEnabled()); complex* retData = ret.dataPref(); const complex* thisData = this->dataPref();
	#ifdef GPU_ENABLED
	matrixSubGet_gpu(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, thisData, retData);
	#else
	for(int i=0; i<iDelta; i++)
		for(int j=0; j<jDelta; j++)
			retData[ret.index(i,j)] = thisData[this->index(i*iStep+iStart, j*jStep+jStart)];
	#endif
	return ret;
}
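//Illustrative only: with the strided form above, A(0,2,A.nRows(), 1,2,A.nCols()) would
//gather every other row (starting at 0) and every other column (starting at 1) into a
//new matrix of the ceildiv'd dimensions, allocated on the GPU when GPU_ENABLED.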


void matrix::set(int i, int j, complex m)
{	assert(i<nr and i>=0);
	assert(j<nc and j>=0);
	if(isOnGpu())
	{
		#ifdef GPU_ENABLED
		cudaMemcpy(dataGpu()+index(i,j), &m, sizeof(complex), cudaMemcpyHostToDevice);
		#else
		assert(!"onGpu=true without GPU_ENABLED");
		#endif
	}
	else data()[index(i,j)] = m;
}

#ifdef GPU_ENABLED
void matrixSubSet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
void matrixSubAccum_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out);
#endif
#define DECLARE_matrixSubSetAccum(nAME, NAME, OP) \
	void matrixSub ##NAME(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
	{	for(int j=0; j<jDelta; j++) \
			for(int i=0; i<iDelta; i++) \
				out[nr*(j*jStep+jStart) + (i*iStep+iStart)] OP *(in++); \
	} \
	void matrix::nAME(int iStart, int iStep, int iStop, int jStart, int jStep, int jStop, const matrix& m) \
	{	static StopWatch watch("matrix::" #nAME); watch.start(); \
		assert(iStart>=0 && iStart<nr); \
		assert(iStop>iStart && iStop<=nr); \
		assert(iStep>0); \
		assert(jStart>=0 && jStart<nc); \
		assert(jStop>jStart && jStop<=nc); \
		assert(jStep>0); \
		int iDelta = ceildiv(iStop-iStart, iStep); \
		int jDelta = ceildiv(jStop-jStart, jStep); \
		assert(iDelta==m.nr); \
		assert(jDelta==m.nc); \
		callPref(matrixSub ##NAME)(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, m.dataPref(), dataPref()); \
		watch.stop(); \
	}
DECLARE_matrixSubSetAccum(set, Set, =)
DECLARE_matrixSubSetAccum(accum, Accum, +=)
#undef DECLARE_matrixSubSetAccum
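//Hypothetical usage of the generated members (sketch only; A, m, i0, j0 are placeholders,
//with m shaped to match the strided sub-block as checked by the asserts above):
//	A.set  (i0,1,i0+m.nRows(), j0,1,j0+m.nCols(), m); //A(i0..., j0...) = m
//	A.accum(i0,1,i0+m.nRows(), j0,1,j0+m.nCols(), m); //A(i0..., j0...) += m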

//----------------------- Arithmetic ---------------------

matrix operator*(const matrixScaledTransOp &m1st, const matrixScaledTransOp &m2st)
{	assert(m1st.nCols() == m2st.nRows());
	const matrix& m1 = m1st.mat;
	const matrix& m2 = m2st.mat;
	double scaleFac = m1st.scale * m2st.scale;
	matrix ret(m1st.nRows(), m2st.nCols(), isGpuEnabled());
	callPref(eblas_zgemm)(m1st.op, m2st.op,
		ret.nRows(), ret.nCols(), m1st.nCols(), scaleFac,
		m1.dataPref()+m1st.index(0,0), m1.nRows(),
		m2.dataPref()+m2st.index(0,0), m2.nRows(),
		0.0, ret.dataPref(), ret.nRows());
	return ret;
}
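//Note (descriptive): the scale factors of both operands fold into the single zgemm alpha,
//the pending transpose/conjugate flags pass through as the BLAS op arguments, and the
//index(0,0) offsets select each operand's sub-matrix within its parent storage.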

//----- diagonal matrix multiplies
template<typename scalar> void mulMD(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
{	for(int j=0; j<nCols; j++)
		for(int i=0; i<nRows; i++)
			*(out++) = *(M++) * D[j];
}
template<typename scalar> void mulDM(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
{	for(int j=0; j<nCols; j++)
		for(int i=0; i<nRows; i++)
			*(out++) = D[i] * (*(M++));
}
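//(mulMD scales column j of M by D[j], i.e. computes M*diag(D); mulDM scales row i by D[i], i.e. diag(D)*M.)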
#define mulMDdouble  mulMD<double>
#define mulMDcomplex mulMD<complex>
#define mulDMdouble  mulDM<double>
#define mulDMcomplex mulDM<complex>
#ifdef GPU_ENABLED
void mulMDdouble_gpu(int nRows, int nCols, const complex* M, const double* D, complex* out);
void mulDMdouble_gpu(int nRows, int nCols, const double* D, const complex* M, complex* out);
void mulMDcomplex_gpu(int nRows, int nCols, const complex* M, const complex* D, complex* out);
void mulDMcomplex_gpu(int nRows, int nCols, const complex* D, const complex* M, complex* out);
#define GetData(D,scalar) ManagedArray<scalar>(D).dataGpu()
#else
#define GetData(D,scalar) D.data()
#endif
matrix operator*(const matrix& m, const diagMatrix& d)
{	assert(m.nCols()==d.nRows());
	matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
	callPref(mulMDdouble)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,double), ret.dataPref());
	return ret;
}
matrix operator*(const matrix& m, const std::vector<complex>& d)
{	assert(m.nCols()==int(d.size()));
	matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
	callPref(mulMDcomplex)(m.nRows(), m.nCols(), m.dataPref(), GetData(d,complex), ret.dataPref());
	return ret;
}
matrix operator*(const diagMatrix& d, const matrix& m)
{	assert(d.nCols()==m.nRows());
	matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
	callPref(mulDMdouble)(m.nRows(), m.nCols(), GetData(d,double), m.dataPref(), ret.dataPref());
	return ret;
}
matrix operator*(const std::vector<complex>& d, const matrix& m)
{	assert(int(d.size())==m.nRows());
	matrix ret(m.nRows(), m.nCols(), isGpuEnabled());
	callPref(mulDMcomplex)(m.nRows(), m.nCols(), GetData(d,complex), m.dataPref(), ret.dataPref());
	return ret;
}


diagMatrix operator*(const diagMatrix& d1, const diagMatrix& d2)
{	assert(d1.nCols()==d2.nRows());
	diagMatrix ret(d1);
	for(int i=0; i<ret.nRows(); i++) ret[i] *= d2[i]; //elementwise multiply
	return ret;
}

void axpy(double alpha, const diagMatrix& x, matrix& y)
{	assert(x.nRows()==y.nRows());
	assert(x.nCols()==y.nCols());
	int diagStride = y.nRows() + 1;
	callPref(eblas_daxpy)(x.nRows(), alpha, GetData(x,double),1, (double*)y.dataPref(),diagStride*2);
}
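//Note (descriptive): viewing y as doubles, the stride of 2*(nRows+1) lands on the real part
//of each diagonal element, so only Re(y_ii) += alpha*x_i is performed by the daxpy above.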

void axpy(double alpha, const diagMatrix& x, diagMatrix& y)
{	assert(x.nRows()==y.nRows());
	for(int i=0; i<y.nRows(); i++) y[i] += alpha * x[i];
}

diagMatrix clone(const diagMatrix& x) { return x; }
matrix clone(const matrix& x) { return x; }

double dot(const diagMatrix& x, const diagMatrix& y)
{	assert(x.size()==y.size());
	double ret = 0.;
	for(size_t i=0; i<x.size(); i++)
		ret += x[i]*y[i];
	return ret;
}
double dot(const matrix& x, const matrix& y) { return dotc(x,y).real(); }

void randomize(diagMatrix& x) { for(size_t i=0; i<x.size(); i++) x[i] = Random::normal(); }
void randomize(matrix& x)
{	complex* xData = x.data();
	for(size_t i=0; i<x.nData(); i++)
		xData[i] = Random::normalComplex();
}

//Compute diag(dagger(X)*Y) efficiently (avoid the off-diagonals)
void diagDot_thread(int colStart, int colStop, int nRows, const complex* X, const complex* Y, double* out)
{	const complex* Xcur = X + colStart*nRows;
	const complex* Ycur = Y + colStart*nRows;
	for(int col=colStart; col<colStop; col++)
	{	double result = 0.;
		for(int iRow=0; iRow<nRows; iRow++)
			result += ((Xcur++)->conj() * (*(Ycur++))).real();
		out[col] = result;
	}
}
void diagDot(int nRows, int nCols, const complex* X, const complex* Y, double* out)
{	threadLaunch(diagDot_thread, nCols, nRows, X, Y, out);
}
#ifdef GPU_ENABLED
void diagDot_gpu(int nRows, int nCols, const complex* X, const complex* Y, double* out); //implemented in matrixOperators.cu
#endif
diagMatrix diagDot(const matrix& X, const matrix& Y)
{	static StopWatch watch("diagDot"); watch.start();
	assert(X.nCols()==Y.nCols());
	assert(X.nRows()==Y.nRows());
	ManagedArray<double> buf; buf.init(X.nCols(), isGpuEnabled());
	callPref(diagDot)(X.nRows(), X.nCols(), X.dataPref(), Y.dataPref(), buf.dataPref());
	//Copy to diagMatrix:
	diagMatrix result(X.nCols());
	eblas_copy(result.data(), buf.data(), X.nCols());
	watch.stop();
	return result;
}

//---------------- Misc matrix ops --------------------

complex det(const matrix& A)
{
	matrix decomposition = LU(A);
	int N = A.nRows();

	//Multiply the diagonal entries to get the determinant up to a sign
	complex determinant(1., 0.);
	for(int i=0; i<N; i++)
		determinant *= decomposition(i,i);

	return determinant;
}

double det(const diagMatrix& A)
{	double determinant = 1.;
	for(int i=0; i<A.nCols(); i++)
		determinant *= A[i];
	return determinant;
}

complex trace(const matrix &A)
{	assert(A.nRows() == A.nCols());
	matrix one(eye(1));
	return callPref(eblas_zdotc)(A.nRows(), one.dataPref(),0, A.dataPref(),A.nCols()+1);
}
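//Note (descriptive): the zdotc above dots a single "1" (stride 0) against the diagonal
//of A (stride nCols+1), i.e. it sums the diagonal entries in one BLAS call.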

double trace(const diagMatrix& A)
{	double ret=0.0;
	for(double d: A) ret += d;
	return ret;
}

double nrm2(const diagMatrix& A)
{	double ret=0.0;
	for(double d: A) ret += d*d;
	return sqrt(ret);
}

diagMatrix diag(const matrix &A)
{	int N = A.nRows();
	assert(N==A.nCols());
	diagMatrix ret(N);
	int diagStride = N + 1;
	#ifdef GPU_ENABLED
	ManagedArray<double> retManaged; retManaged.init(N, true);
	eblas_zero_gpu(N, retManaged.dataGpu());
	eblas_daxpy_gpu(N, 1., (const double*)A.dataGpu(),diagStride*2, retManaged.dataGpu(),1);
	eblas_copy(ret.data(), retManaged.data(), N);
	#else
	eblas_daxpy(N, 1., (const double*)A.data(),diagStride*2, ret.data(),1);
	#endif
	return ret;
}
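//Note (descriptive): as in axpy above, the 2*(N+1) double stride picks out only the real
//parts of the diagonal, so any imaginary parts of diagonal entries are discarded.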

diagMatrix eye(int N)
{	diagMatrix ret(N, 1.);
	return ret;
}

matrix zeroes(int nRows, int nCols)
{	matrix ret(nRows, nCols, isGpuEnabled());
	ret.zero();
	return ret;
}


//Apply pending transpose / dagger operations:
matrixScaledTransOp::operator matrix() const
{	if(op==CblasNoTrans)
		return scale * mat(iStart,1,iStart+iDelta, jStart,1,jStart+jDelta);
	else
	{	matrix ret = zeroes(nRows(), nCols());
		//Copy data, applying transpose and submatrix
		complex* retData = ret.dataPref();
		const complex* inData = mat.dataPref()+index(0,0);
		for(int j=0; j<ret.nCols(); j++)
		{	callPref(eblas_zaxpy)(ret.nRows(), scale, inData,mat.nRows(), retData,1);
			retData += ret.nRows();
			inData++;
		}
		//Apply conjugate if needed:
		if(op==CblasConjTrans)
			callPref(eblas_dscal)(ret.nData(), -1., ((double*)ret.dataPref())+1, 2); //negate imag part
		return ret;
	}
}
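//Note (descriptive): each zaxpy above gathers one row of the (sub)matrix (stride
//mat.nRows()) into a contiguous column of ret, effecting the transpose column by column;
//the final dscal over every second double negates the imaginary parts for the dagger case.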

//Complex conjugate:
matrix conj(const scaled<matrix>& A)
{	matrix B = A.data;
	double* Bdata = (double*)B.dataPref();
	callPref(eblas_dscal)(B.nData(), A.scale, Bdata, 2); //real parts
	callPref(eblas_dscal)(B.nData(), -A.scale, Bdata+1, 2); //imag parts
	return B;
}

//Hermitian adjoint:
matrixScaledTransOp dagger(const matrixScaledTransOp& A)
{	matrixScaledTransOp result(A);
	assert(A.op != CblasTrans); //dagger(CblasTrans) = "CblasConj" is not supported in BLAS
	result.op = (A.op==CblasNoTrans) ? CblasConjTrans : CblasNoTrans;
	return result;
	//Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
}
matrix dagger_symmetrize(const matrixScaledTransOp& A)
{	return 0.5*(dagger(A) + A);
}
//Transpose:
matrixScaledTransOp transpose(const matrixScaledTransOp& A)
{	matrixScaledTransOp result(A);
	assert(A.op != CblasConjTrans); //transpose(CblasConjTrans) = "CblasConj" is not supported in BLAS
	result.op = (A.op==CblasNoTrans) ? CblasTrans : CblasNoTrans;
	return result;
	//Note: sub-matrix operation refers to input matrix and does not need to be handled above (because op applies after submatrix logically)
}
matrix transpose_symmetrize(const matrixScaledTransOp& A)
{	return 0.5*(transpose(A) + A);
}
double nrm2(const matrixScaledTransOp& A)
{	const complex* Adata = A.mat.dataPref() + A.mat.index(A.iStart,A.jStart);
	if(A.iDelta == A.mat.nRows()) //contiguous
		return callPref(eblas_dznrm2)(A.iDelta*A.jDelta, Adata, 1);
	else //not contiguous
	{	double resultSq = 0.;
		for(int j=0; j<A.jDelta; j++) //loop over columns, each of which is contiguous
		{	resultSq += std::pow(callPref(eblas_dznrm2)(A.iDelta, Adata, 1), 2);
			Adata += A.mat.nRows(); //stride between columns of the matrix
		}
		return sqrt(resultSq);
	}
	//Note: transpose/conjugate do not affect nrm2 and hence do not need to be handled above
}

double relativeHermiticityError(int N, const complex* data)
{	double errNum=0.0, errDen=1e-20; //avoid irrelevant error for zero matrix
	for(int i=0; i<N; i++)
		for(int j=0; j<N; j++)
		{	int index = N*i + j;
			int indexT = N*j + i;
			errNum += norm(data[index]-data[indexT].conj());
			errDen += norm(data[index]);
		}
	return sqrt(errNum / (errDen*N));
}
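//i.e. sqrt( sum_ij |A_ij - conj(A_ji)|^2 / (N * sum_ij |A_ij|^2) ), with a tiny
//denominator floor so that an all-zero matrix reports zero error.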

//------------ Block matrices ------------

tiledBlockMatrix::tiledBlockMatrix(const matrix& mBlock, int nBlocks, const std::vector<complex>* phaseArr) : mBlock(mBlock), nBlocks(nBlocks), phaseArr(phaseArr)
{	if(phaseArr) assert(nBlocks==int(phaseArr->size()));
}
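//Note (descriptive): a tiledBlockMatrix represents a block-diagonal matrix with nBlocks
//repetitions of mBlock along the diagonal, each optionally scaled by a per-block phase
//factor taken from phaseArr.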

matrix tiledBlockMatrix::operator*(const matrix& other) const
{	assert(mBlock.nCols()*nBlocks == other.nRows());
	matrix result(mBlock.nRows()*nBlocks, other.nCols(), isGpuEnabled());
	//Dense matrix multiply for each block:
	for(int iBlock=0; iBlock<nBlocks; iBlock++)
	{	int offs = iBlock * mBlock.nCols();
		complex phase = phaseArr ? phaseArr->at(iBlock) : 1.;
		callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, mBlock.nRows(), other.nCols(), mBlock.nCols(),
			phase, mBlock.dataPref(), mBlock.nRows(), other.dataPref()+offs, other.nRows(),
			0.0, result.dataPref()+offs, result.nRows());
	}
	return result;
}

matrix operator*(const matrix& m, const tiledBlockMatrix& tbm)
{	assert(m.nCols() == tbm.mBlock.nRows()*tbm.nBlocks);
	matrix result(m.nRows(), tbm.mBlock.nCols()*tbm.nBlocks, isGpuEnabled());
	//Dense matrix multiply for each block:
	for(int iBlock=0; iBlock<tbm.nBlocks; iBlock++)
	{	int offsIn = iBlock * tbm.mBlock.nRows() * m.nRows();
		int offsOut = iBlock * tbm.mBlock.nCols() * m.nRows();
		complex phase = tbm.phaseArr ? tbm.phaseArr->at(iBlock) : 1.;
		callPref(eblas_zgemm)(CblasNoTrans, CblasNoTrans, m.nRows(), tbm.mBlock.nCols(), tbm.mBlock.nRows(),
			phase, m.dataPref()+offsIn, m.nRows(), tbm.mBlock.dataPref(), tbm.mBlock.nRows(),
			0.0, result.dataPref()+offsOut, result.nRows());
	}
	return result;
}