1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #include <core/GpuKernelUtils.h>
21 #include <core/BlasExtra_internal.h>
22 #include <algorithm>
23 #include <cublas_v2.h>
24 #include <cfloat>
25 #include <gsl/gsl_cblas.h>
26 
27 template<typename Tx, typename Ty> __global__
eblas_mul_kernel(const int N,const Tx * X,const int incX,Ty * Y,const int incY)28 void eblas_mul_kernel(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
29 {	int i = kernelIndex1D();
30 	if(i<N) Y[i*incY] *= X[i*incX];
31 }
eblas_dmul_gpu(const int N,const double * X,const int incX,double * Y,const int incY)32 void eblas_dmul_gpu(const int N, const double* X, const int incX, double* Y, const int incY)
33 {	GpuLaunchConfig1D glc(eblas_mul_kernel<double,double>, N);
34 	eblas_mul_kernel<double,double><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
35 	gpuErrorCheck();
36 }
eblas_zmul_gpu(const int N,const complex * X,const int incX,complex * Y,const int incY)37 void eblas_zmul_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY)
38 {	GpuLaunchConfig1D glc(eblas_mul_kernel<complex,complex>, N);
39 	eblas_mul_kernel<complex,complex><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
40 	gpuErrorCheck();
41 }
eblas_zmuld_gpu(const int N,const double * X,const int incX,complex * Y,const int incY)42 void eblas_zmuld_gpu(const int N, const double* X, const int incX, complex* Y, const int incY)
43 {	GpuLaunchConfig1D glc(eblas_mul_kernel<double,complex>, N);
44 	eblas_mul_kernel<double,complex><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
45 	gpuErrorCheck();
46 }
47 
48 
49 __global__
eblas_lincomb_kernel(const int N,complex sX,const complex * X,const int incX,complex sY,const complex * Y,const int incY,complex * Z,const int incZ)50 void eblas_lincomb_kernel(const int N,
51 	complex sX, const complex* X, const int incX,
52 	complex sY, const complex* Y, const int incY,
53 	complex* Z, const int incZ)
54 {	int i = kernelIndex1D();
55 	if(i<N) Z[i*incZ] = sX*X[i*incX] + sY*Y[i*incY];
56 }
eblas_lincomb_gpu(const int N,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)57 void eblas_lincomb_gpu(const int N,
58 	const complex& sX, const complex* X, const int incX,
59 	const complex& sY, const complex* Y, const int incY,
60 	complex* Z, const int incZ)
61 {	GpuLaunchConfig1D glc(eblas_lincomb_kernel, N);
62 	eblas_lincomb_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, sX,X,incX, sY,Y,incY, Z,incZ);
63 	gpuErrorCheck();
64 }
65 
cublasTranspose(CBLAS_TRANSPOSE trans)66 inline cublasOperation_t cublasTranspose(CBLAS_TRANSPOSE trans)
67 {	switch(trans)
68 	{	case CblasNoTrans: return CUBLAS_OP_N;
69 		case CblasTrans: return CUBLAS_OP_T;
70 		case CblasConjTrans: return CUBLAS_OP_C;
71 	}
72 	return CUBLAS_OP_N;
73 }
eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA,CBLAS_TRANSPOSE TransB,int M,int N,int K,const complex & alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex & beta,complex * C,const int ldc)74 void eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
75 	const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
76 	const complex& beta, complex *C, const int ldc)
77 {	cublasZgemm(cublasHandle, cublasTranspose(TransA), cublasTranspose(TransB), M, N, K,
78 		(const double2*)&alpha, (const double2*)A, lda, (const double2*)B, ldb,
79 		(const double2*)&beta, (double2*)C, ldc);
80 }
81 
82 template<typename scalar, typename scalar2, typename Conjugator> __global__
eblas_scatter_axpy_kernel(const int N,scalar2 a,const int * index,const scalar * x,scalar * y,const scalar * w,const Conjugator & conjugator)83 void eblas_scatter_axpy_kernel(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator)
84 {	int i = kernelIndex1D();
85 	if(i<N) y[index[i]] += a * conjugator(x,i, w,i);
86 }
87 template<typename scalar, typename scalar2, typename Conjugator> __global__
eblas_gather_axpy_kernel(const int N,scalar2 a,const int * index,const scalar * x,scalar * y,const scalar * w,const Conjugator & conjugator)88 void eblas_gather_axpy_kernel(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator)
89 {	int i = kernelIndex1D();
90 	if(i<N) y[i] += a * conjugator(x,index[i], w,i);
91 }
92 #define DEFINE_SPARSE_AXPY_GPU_LAUNCHER(type) \
93 	template<typename scalar, typename scalar2, typename Conjugator> \
94 	void eblas_##type##_axpy_gpu(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator) \
95 	{	GpuLaunchConfig1D glc(eblas_##type##_axpy_kernel<scalar,scalar2,Conjugator>, N); \
96 		eblas_##type##_axpy_kernel<scalar,scalar,Conjugator><<<glc.nBlocks,glc.nPerBlock>>>(N, a, index, x, y, w, conjugator); \
97 		gpuErrorCheck(); \
98 	}
99 DEFINE_SPARSE_AXPY_GPU_LAUNCHER(scatter)
DEFINE_SPARSE_AXPY_GPU_LAUNCHER(gather)100 DEFINE_SPARSE_AXPY_GPU_LAUNCHER(gather)
101 DEFINE_SPARSE_AXPY(scatter, _gpu)
102 DEFINE_SPARSE_AXPY(gather, _gpu)
103 
104 
105 __global__
106 void eblas_accumNorm_kernel(int N, double a, const complex* x, double* y)
107 {	int i = kernelIndex1D();
108 	if(i<N) y[i] += a * norm(x[i]);
109 }
eblas_accumNorm_gpu(int N,const double & a,const complex * x,double * y)110 void eblas_accumNorm_gpu(int N, const double& a, const complex* x, double* y)
111 {	GpuLaunchConfig1D glc(eblas_accumNorm_kernel, N);
112 	eblas_accumNorm_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, a, x, y);
113 	gpuErrorCheck();
114 }
115 
116 __global__
eblas_accumProd_kernel(int N,double a,const complex * xU,const complex * xC,double * yRe,double * yIm)117 void eblas_accumProd_kernel(int N, double a, const complex* xU, const complex* xC, double* yRe, double* yIm)
118 {	int i = kernelIndex1D();
119 	if(i<N)
120 	{	complex z = a * xU[i] * xC[i].conj();
121 		yRe[i] += z.real();
122 		yIm[i] += z.imag();
123 	}
124 }
eblas_accumProd_gpu(int N,const double & a,const complex * xU,const complex * xC,double * yRe,double * yIm)125 void eblas_accumProd_gpu(int N, const double& a, const complex* xU, const complex* xC, double* yRe, double* yIm)
126 {	GpuLaunchConfig1D glc(eblas_accumProd_kernel, N);
127 	eblas_accumProd_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, a, xU, xC, yRe, yIm);
128 	gpuErrorCheck();
129 }
130 
131 template<typename scalar> __global__
eblas_symmetrize_kernel(int N,int n,const int * symmIndex,scalar * x,double nInv)132 void eblas_symmetrize_kernel(int N, int n, const int* symmIndex, scalar* x, double nInv)
133 {	int i=kernelIndex1D();
134 	if(i<N) eblas_symmetrize_calc(i, n, symmIndex, x, nInv);
135 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,scalar * x)136 template<typename scalar> void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, scalar* x)
137 {	GpuLaunchConfig1D glc(eblas_symmetrize_kernel<scalar>, N);
138 	eblas_symmetrize_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, x, 1./n);
139 	gpuErrorCheck();
140 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,double * x)141 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, double* x) { eblas_symmetrize_gpu<double>(N, n, symmIndex, x); }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,complex * x)142 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, complex* x) { eblas_symmetrize_gpu<complex>(N, n, symmIndex, x); }
143 
144 __global__
eblas_symmetrize_phase_kernel(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,complex * x)145 void eblas_symmetrize_phase_kernel(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, complex* x)
146 {	int i=kernelIndex1D();
147 	if(i<N) eblas_symmetrize_phase_calc(i, n, symmIndex, symmMult, phase, x);
148 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,complex * x)149 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, complex* x)
150 {	GpuLaunchConfig1D glc(eblas_symmetrize_phase_kernel, N);
151 	eblas_symmetrize_phase_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, symmMult, phase, x);
152 	gpuErrorCheck();
153 }
154 
155 __global__
eblas_symmetrize_phase_rot_kernel(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,const matrix3<> * rotSpin,complexPtr4 x)156 void eblas_symmetrize_phase_rot_kernel(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, const matrix3<>* rotSpin, complexPtr4 x)
157 {	int i=kernelIndex1D();
158 	if(i<N) eblas_symmetrize_phase_rot_calc(i, n, symmIndex, symmMult, phase, rotSpin, x);
159 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,const matrix3<> * rotSpin,std::vector<complex * > x)160 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, const matrix3<>* rotSpin, std::vector<complex*> x)
161 {	GpuLaunchConfig1D glc(eblas_symmetrize_phase_rot_kernel, N);
162 	eblas_symmetrize_phase_rot_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, symmMult, phase, rotSpin, complexPtr4(x));
163 	gpuErrorCheck();
164 }
165 
166 //BLAS-1 wrappers:
eblas_dscal_gpu(int N,double a,double * x,int incx)167 void eblas_dscal_gpu(int N, double a, double* x, int incx)
168 {	cublasDscal_v2(cublasHandle, N, &a, x, incx);
169 }
eblas_zdscal_gpu(int N,double a,complex * x,int incx)170 void eblas_zdscal_gpu(int N, double a, complex* x, int incx)
171 {	cublasZdscal_v2(cublasHandle, N, &a, (double2*)x, incx);
172 }
eblas_zscal_gpu(int N,const complex & a,complex * x,int incx)173 void eblas_zscal_gpu(int N, const complex& a, complex* x, int incx)
174 {	cublasZscal_v2(cublasHandle, N, (const double2*)&a, (double2*)x, incx);
175 }
eblas_daxpy_gpu(int N,double a,const double * x,int incx,double * y,int incy)176 void eblas_daxpy_gpu(int N, double a, const double* x, int incx, double* y, int incy)
177 {	cublasDaxpy_v2(cublasHandle, N, &a, x, incx, y, incy);
178 }
eblas_zaxpy_gpu(int N,const complex & a,const complex * x,int incx,complex * y,int incy)179 void eblas_zaxpy_gpu(int N, const complex& a, const complex* x, int incx, complex* y, int incy)
180 {	cublasZaxpy_v2(cublasHandle, N, (const double2*)&a, (const double2*)x, incx, (double2*)y, incy);
181 }
eblas_zdotc_gpu(int N,const complex * x,int incx,const complex * y,int incy)182 complex eblas_zdotc_gpu(int N, const complex* x, int incx, const complex* y, int incy)
183 {	complex result;
184 	cublasZdotc_v2(cublasHandle, N, (const double2*)x, incx, (const double2*)y, incy, (double2*)&result);
185 	return result;
186 }
eblas_ddot_gpu(int N,const double * x,int incx,const double * y,int incy)187 double eblas_ddot_gpu(int N, const double* x, int incx, const double* y, int incy)
188 {	double result;
189 	cublasDdot_v2(cublasHandle, N, x, incx, y, incy, &result);
190 	return result;
191 }
eblas_dznrm2_gpu(int N,const complex * x,int incx)192 double eblas_dznrm2_gpu(int N, const complex* x, int incx)
193 {	double result;
194 	cublasDznrm2_v2(cublasHandle, N, (const double2*)x, incx, &result);
195 	return result;
196 }
eblas_dnrm2_gpu(int N,const double * x,int incx)197 double eblas_dnrm2_gpu(int N, const double* x, int incx)
198 {	double result;
199 	cublasDnrm2_v2(cublasHandle, N, x, incx, &result);
200 	return result;
201 }
202 
203 
204 //Min-max:
205 
206 //forward declare the cpu version (used at the end for colletcing results):
207 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
208 
209 extern __shared__ double xMinLoc[];
210 __global__
eblas_capMinMax_kernel(int N,double * x,double * xMinBlk,double * xMaxBlk,double capLo,double capHi)211 void eblas_capMinMax_kernel(int N, double* x, double* xMinBlk, double* xMaxBlk, double capLo, double capHi)
212 {	int i=kernelIndex1D();
213 	int iThread = threadIdx.x;
214 	//Store the original value as the local min and max:
215 	double* xMaxLoc = xMinLoc + blockDim.x;
216 	if(i<N)
217 	{	xMinLoc[iThread] = x[i];
218 		xMaxLoc[iThread] = x[i];
219 		//Cap the value in the array:
220 		if(x[i]<capLo) x[i]=capLo;
221 		if(x[i]>capHi) x[i]=capHi;
222 	}
223 	else
224 	{	xMinLoc[iThread] = +DBL_MAX;
225 		xMaxLoc[iThread] = -DBL_MAX;
226 	}
227 	//Min-max within block:
228 	int extent = blockDim.x/2;
229 	int stride = (blockDim.x+1)/2;
230 	while(extent)
231 	{	__syncthreads();
232 		if(iThread<extent)
233 		{	if(xMinLoc[iThread+stride]<xMinLoc[iThread]) xMinLoc[iThread]=xMinLoc[iThread+stride];
234 			if(xMaxLoc[iThread+stride]>xMaxLoc[iThread]) xMaxLoc[iThread]=xMaxLoc[iThread+stride];
235 		}
236 		extent = stride/2;
237 		stride = (stride+1)/2;
238 	}
239 	__syncthreads();
240 	//Save to the global min-max:
241 	if(iThread==0)
242 	{	int iBlock = blockIdx.y*gridDim.x+blockIdx.x;
243 		xMinBlk[iBlock] = xMinLoc[0];
244 		xMaxBlk[iBlock] = xMaxLoc[0];
245 	}
246 }
eblas_capMinMax_gpu(const int N,double * x,double & xMin,double & xMax,double capLo,double capHi)247 void eblas_capMinMax_gpu(const int N, double* x, double& xMin, double& xMax, double capLo, double capHi)
248 {	GpuLaunchConfig1D glc(eblas_capMinMax_kernel, N);
249 	int nBlocksTot = glc.nBlocks.x * glc.nBlocks.y;
250 	//First perform the capping and obtain the max and min per block:
251 	double* xMinBlk; cudaMalloc(&xMinBlk, sizeof(double)*nBlocksTot*2);
252 	double* xMaxBlk = xMinBlk + nBlocksTot;
253 	int sharedMemBytes = 2*glc.nPerBlock.x*sizeof(double);
254 	eblas_capMinMax_kernel<<<glc.nBlocks,glc.nPerBlock,sharedMemBytes>>>(N,x,xMinBlk,xMaxBlk,capLo,capHi);
255 	//Finish on the CPU:
256 	double* xMinCpu = new double[2*nBlocksTot];
257 	double* xMaxCpu = xMinCpu + nBlocksTot;
258 	cudaMemcpy(xMinCpu, xMinBlk, sizeof(double)*nBlocksTot*2, cudaMemcpyDeviceToHost);
259 	cudaFree(xMinBlk);
260 	xMin = *std::min_element(xMinCpu, xMinCpu+nBlocksTot);
261 	xMax = *std::max_element(xMaxCpu, xMaxCpu+nBlocksTot);
262 	delete[] xMinCpu;
263 }
264