1 /*-------------------------------------------------------------------
2 Copyright 2018 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #include <core/GpuKernelUtils.h>
21 #include <cublas_v2.h>
22 
//! Copy a strided sub-matrix of column-major matrix in (leading dimension nr)
//! into the dense iDelta x jDelta buffer out.
//! One thread per output entry: kernel x -> sub-matrix row, kernel y -> sub-matrix column.
__global__
void matrixSubGet_kernel(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out)
{	int iOut = kernelIndex(x);
	int jOut = kernelIndex(y);
	if((iOut >= iDelta) || (jOut >= jDelta)) return; //threads past the edge of the launch grid
	int iIn = iStart + iOut*iStep; //source row in full matrix
	int jIn = jStart + jOut*jStep; //source column in full matrix
	out[jOut*iDelta + iOut] = in[jIn*nr + iIn];
}
matrixSubGet_gpu(int nr,int iStart,int iStep,int iDelta,int jStart,int jStep,int jDelta,const complex * in,complex * out)30 void matrixSubGet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out)
31 {	GpuLaunchConfig3D glc(matrixSubGet_kernel, vector3<int>(1,jDelta,iDelta));
32 	matrixSubGet_kernel<<<glc.nBlocks,glc.nPerBlock>>>(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, in, out);
33 	gpuErrorCheck();
34 }
35 
//Generate kernel + launch-wrapper pairs that write a dense iDelta x jDelta buffer 'in'
//into the strided sub-matrix of column-major 'out' (leading dimension nr) selected by
//(iStart,iStep) x (jStart,jStep) — the inverse operations of matrixSubGet above.
//NAME is the function-name suffix and OP the assignment operator applied per entry.
//(Comments are kept outside the macro body: '//' inside a \-continued macro would
//splice the next line into the comment, since line splicing precedes comment removal.)
#define DECLARE_matrixSubSetAccum(NAME, OP) \
	__global__ \
	void matrixSub ##NAME## _kernel(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
	{	int i = kernelIndex(x); if(i>=iDelta) return; \
		int j = kernelIndex(y); if(j>=jDelta) return; \
		out[nr*(j*jStep+jStart) + (i*iStep+iStart)] OP in[iDelta*j+i]; \
	} \
	void matrixSub ##NAME## _gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
	{	GpuLaunchConfig3D glc(matrixSub ##NAME## _kernel, vector3<int>(1,jDelta,iDelta)); \
		matrixSub ##NAME## _kernel<<<glc.nBlocks,glc.nPerBlock>>>(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, in, out); \
		gpuErrorCheck(); \
	}
DECLARE_matrixSubSetAccum(Set, =) //matrixSubSet_gpu: sub-matrix of out = in
DECLARE_matrixSubSetAccum(Accum, +=) //matrixSubAccum_gpu: sub-matrix of out += in
#undef DECLARE_matrixSubSetAccum
51 
52 
53 //----- diagonal matrix multiplies
mulMD_kernel(int nRows,int nCols,const complex * M,const scalar * D,complex * out)54 template<typename scalar> __global__ void mulMD_kernel(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
55 {	int i = kernelIndex(x); if(i>=nRows) return;
56 	int j = kernelIndex(y); if(j>=nCols) return;
57 	int index = j*nRows + i;
58 	out[index] = M[index] * D[j];
59 }
mulDM_kernel(int nRows,int nCols,const scalar * D,const complex * M,complex * out)60 template<typename scalar> __global__ void mulDM_kernel(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
61 {	int i = kernelIndex(x); if(i>=nRows) return;
62 	int j = kernelIndex(y); if(j>=nCols) return;
63 	int index = j*nRows + i;
64 	out[index] = D[i] * M[index];
65 }
mulMD_gpu(int nRows,int nCols,const complex * M,const scalar * D,complex * out)66 template<typename scalar> void mulMD_gpu(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
67 {	static const int nColsMax = 32768;
68 	if(nCols > nColsMax)
69 	{	//Break up into smaller calls to avoid CUDA grid limits:
70 		int nLoop = ceildiv(nCols, nColsMax);
71 		for(int colStart=0; colStart<nCols; colStart+=nColsMax)
72 		{	int nColsCur = std::min(nCols-colStart, nColsMax);
73 			int offset = colStart * nRows;
74 			mulMD_gpu<scalar>(nRows, nColsCur, M+offset, D+colStart, out+offset);
75 		}
76 		return;
77 	}
78 	GpuLaunchConfig3D glc(mulMD_kernel<scalar>, vector3<int>(1,nCols,nRows));
79 	mulMD_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(nRows, nCols, M, D, out);
80 	gpuErrorCheck();
81 }
mulDM_gpu(int nRows,int nCols,const scalar * D,const complex * M,complex * out)82 template<typename scalar> void mulDM_gpu(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
83 {	GpuLaunchConfig3D glc(mulDM_kernel<scalar>, vector3<int>(1,nCols,nRows));
84 	mulDM_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(nRows, nCols, D, M, out);
85 	gpuErrorCheck();
86 }
mulMDdouble_gpu(int nRows,int nCols,const complex * M,const double * D,complex * out)87 void mulMDdouble_gpu(int nRows, int nCols, const complex* M, const double* D, complex* out) { mulMD_gpu<double>(nRows, nCols, M, D, out); }
mulDMdouble_gpu(int nRows,int nCols,const double * D,const complex * M,complex * out)88 void mulDMdouble_gpu(int nRows, int nCols, const double* D, const complex* M, complex* out) { mulDM_gpu<double>(nRows, nCols, D, M, out); }
mulMDcomplex_gpu(int nRows,int nCols,const complex * M,const complex * D,complex * out)89 void mulMDcomplex_gpu(int nRows, int nCols, const complex* M, const complex* D, complex* out) { mulMD_gpu<complex>(nRows, nCols, M, D, out); }
mulDMcomplex_gpu(int nRows,int nCols,const complex * D,const complex * M,complex * out)90 void mulDMcomplex_gpu(int nRows, int nCols, const complex* D, const complex* M, complex* out) { mulDM_gpu<complex>(nRows, nCols, D, M, out); }
91 
92 
//! Compute out[j] = Re(X(:,j)^dagger . Y(:,j)) for each column j of column-major
//! nRows x nCols matrices X and Y, i.e. the real diagonal of X^dagger Y.
//! One thread per column; each thread serially reduces over its column's rows.
__global__
void diagDot_kernel(int nRows, int nCols, const complex* X, const complex* Y, double* out)
{	int iCol = kernelIndex1D();
	if(iCol >= nCols) return;
	const complex* Xcol = X + iCol*nRows; //start of this thread's column in X
	const complex* Ycol = Y + iCol*nRows; //start of this thread's column in Y
	double sum = 0.;
	for(int iRow=0; iRow<nRows; iRow++)
		sum += (Xcol[iRow].conj() * Ycol[iRow]).real();
	out[iCol] = sum;
}
diagDot_gpu(int nRows,int nCols,const complex * X,const complex * Y,double * out)103 void diagDot_gpu(int nRows, int nCols, const complex* X, const complex* Y, double* out)
104 {	GpuLaunchConfig1D glc(diagDot_kernel, nCols);
105 	diagDot_kernel<<<glc.nBlocks,glc.nPerBlock>>>(nRows, nCols, X, Y, out);
106 	gpuErrorCheck();
107 }
108 
109 
//! Per-row partial sums for the hermiticity error of N x N matrix data:
//! buf[i]   = sum_j |data(i,j) - conj(data(j,i))|^2  (error numerator contribution)
//! buf[i+N] = sum_j |data(i,j)|^2                    (norm denominator contribution)
//! One thread per row i; buf must have room for 2N doubles.
__global__
void relativeHermiticityError_kernel(int N, const complex* data, double* buf)
{	int i = kernelIndex1D();
	if(i >= N) return;
	double rowErr = 0., rowNorm = 0.;
	for(int j=0; j<N; j++)
	{	const complex& Mij = data[N*i + j];
		const complex& Mji = data[N*j + i]; //transposed-position entry
		rowErr += norm(Mij - Mji.conj());
		rowNorm += norm(Mij);
	}
	buf[i] = rowErr;
	buf[i+N] = rowNorm;
}
relativeHermiticityError_gpu(int N,const complex * data)125 double relativeHermiticityError_gpu(int N, const complex* data)
126 {	GpuLaunchConfig1D glc(relativeHermiticityError_kernel, N);
127 	double* buf; cudaMalloc(&buf, sizeof(double)*(2*N)); //buffer to store results per row
128 	relativeHermiticityError_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, data, buf);
129 	gpuErrorCheck();
130 	double errNum = 0., errDen = 0.;
131 	cublasDasum(cublasHandle, N, buf, 1, &errNum);
132 	cublasDasum(cublasHandle, N, buf+N, 1, &errDen);
133 	cudaFree(buf);
134 	return sqrt(errNum / (errDen*N));
135 }
136