1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3
4 This file is part of JDFTx.
5
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19
20 #include <core/GpuKernelUtils.h>
21 #include <core/BlasExtra_internal.h>
22 #include <algorithm>
23 #include <cublas_v2.h>
24 #include <cfloat>
25 #include <gsl/gsl_cblas.h>
26
27 template<typename Tx, typename Ty> __global__
eblas_mul_kernel(const int N,const Tx * X,const int incX,Ty * Y,const int incY)28 void eblas_mul_kernel(const int N, const Tx* X, const int incX, Ty* Y, const int incY)
29 { int i = kernelIndex1D();
30 if(i<N) Y[i*incY] *= X[i*incX];
31 }
eblas_dmul_gpu(const int N,const double * X,const int incX,double * Y,const int incY)32 void eblas_dmul_gpu(const int N, const double* X, const int incX, double* Y, const int incY)
33 { GpuLaunchConfig1D glc(eblas_mul_kernel<double,double>, N);
34 eblas_mul_kernel<double,double><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
35 gpuErrorCheck();
36 }
eblas_zmul_gpu(const int N,const complex * X,const int incX,complex * Y,const int incY)37 void eblas_zmul_gpu(const int N, const complex* X, const int incX, complex* Y, const int incY)
38 { GpuLaunchConfig1D glc(eblas_mul_kernel<complex,complex>, N);
39 eblas_mul_kernel<complex,complex><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
40 gpuErrorCheck();
41 }
eblas_zmuld_gpu(const int N,const double * X,const int incX,complex * Y,const int incY)42 void eblas_zmuld_gpu(const int N, const double* X, const int incX, complex* Y, const int incY)
43 { GpuLaunchConfig1D glc(eblas_mul_kernel<double,complex>, N);
44 eblas_mul_kernel<double,complex><<<glc.nBlocks,glc.nPerBlock>>>(N, X,incX, Y,incY);
45 gpuErrorCheck();
46 }
47
48
49 __global__
eblas_lincomb_kernel(const int N,complex sX,const complex * X,const int incX,complex sY,const complex * Y,const int incY,complex * Z,const int incZ)50 void eblas_lincomb_kernel(const int N,
51 complex sX, const complex* X, const int incX,
52 complex sY, const complex* Y, const int incY,
53 complex* Z, const int incZ)
54 { int i = kernelIndex1D();
55 if(i<N) Z[i*incZ] = sX*X[i*incX] + sY*Y[i*incY];
56 }
eblas_lincomb_gpu(const int N,const complex & sX,const complex * X,const int incX,const complex & sY,const complex * Y,const int incY,complex * Z,const int incZ)57 void eblas_lincomb_gpu(const int N,
58 const complex& sX, const complex* X, const int incX,
59 const complex& sY, const complex* Y, const int incY,
60 complex* Z, const int incZ)
61 { GpuLaunchConfig1D glc(eblas_lincomb_kernel, N);
62 eblas_lincomb_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, sX,X,incX, sY,Y,incY, Z,incZ);
63 gpuErrorCheck();
64 }
65
cublasTranspose(CBLAS_TRANSPOSE trans)66 inline cublasOperation_t cublasTranspose(CBLAS_TRANSPOSE trans)
67 { switch(trans)
68 { case CblasNoTrans: return CUBLAS_OP_N;
69 case CblasTrans: return CUBLAS_OP_T;
70 case CblasConjTrans: return CUBLAS_OP_C;
71 }
72 return CUBLAS_OP_N;
73 }
eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA,CBLAS_TRANSPOSE TransB,int M,int N,int K,const complex & alpha,const complex * A,const int lda,const complex * B,const int ldb,const complex & beta,complex * C,const int ldc)74 void eblas_zgemm_gpu(CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB, int M, int N, int K,
75 const complex& alpha, const complex *A, const int lda, const complex *B, const int ldb,
76 const complex& beta, complex *C, const int ldc)
77 { cublasZgemm(cublasHandle, cublasTranspose(TransA), cublasTranspose(TransB), M, N, K,
78 (const double2*)&alpha, (const double2*)A, lda, (const double2*)B, ldb,
79 (const double2*)&beta, (double2*)C, ldc);
80 }
81
82 template<typename scalar, typename scalar2, typename Conjugator> __global__
eblas_scatter_axpy_kernel(const int N,scalar2 a,const int * index,const scalar * x,scalar * y,const scalar * w,const Conjugator & conjugator)83 void eblas_scatter_axpy_kernel(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator)
84 { int i = kernelIndex1D();
85 if(i<N) y[index[i]] += a * conjugator(x,i, w,i);
86 }
87 template<typename scalar, typename scalar2, typename Conjugator> __global__
eblas_gather_axpy_kernel(const int N,scalar2 a,const int * index,const scalar * x,scalar * y,const scalar * w,const Conjugator & conjugator)88 void eblas_gather_axpy_kernel(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator)
89 { int i = kernelIndex1D();
90 if(i<N) y[i] += a * conjugator(x,index[i], w,i);
91 }
92 #define DEFINE_SPARSE_AXPY_GPU_LAUNCHER(type) \
93 template<typename scalar, typename scalar2, typename Conjugator> \
94 void eblas_##type##_axpy_gpu(const int N, scalar2 a, const int* index, const scalar* x, scalar* y, const scalar* w, const Conjugator& conjugator) \
95 { GpuLaunchConfig1D glc(eblas_##type##_axpy_kernel<scalar,scalar2,Conjugator>, N); \
96 eblas_##type##_axpy_kernel<scalar,scalar,Conjugator><<<glc.nBlocks,glc.nPerBlock>>>(N, a, index, x, y, w, conjugator); \
97 gpuErrorCheck(); \
98 }
99 DEFINE_SPARSE_AXPY_GPU_LAUNCHER(scatter)
DEFINE_SPARSE_AXPY_GPU_LAUNCHER(gather)100 DEFINE_SPARSE_AXPY_GPU_LAUNCHER(gather)
101 DEFINE_SPARSE_AXPY(scatter, _gpu)
102 DEFINE_SPARSE_AXPY(gather, _gpu)
103
104
105 __global__
106 void eblas_accumNorm_kernel(int N, double a, const complex* x, double* y)
107 { int i = kernelIndex1D();
108 if(i<N) y[i] += a * norm(x[i]);
109 }
eblas_accumNorm_gpu(int N,const double & a,const complex * x,double * y)110 void eblas_accumNorm_gpu(int N, const double& a, const complex* x, double* y)
111 { GpuLaunchConfig1D glc(eblas_accumNorm_kernel, N);
112 eblas_accumNorm_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, a, x, y);
113 gpuErrorCheck();
114 }
115
116 __global__
eblas_accumProd_kernel(int N,double a,const complex * xU,const complex * xC,double * yRe,double * yIm)117 void eblas_accumProd_kernel(int N, double a, const complex* xU, const complex* xC, double* yRe, double* yIm)
118 { int i = kernelIndex1D();
119 if(i<N)
120 { complex z = a * xU[i] * xC[i].conj();
121 yRe[i] += z.real();
122 yIm[i] += z.imag();
123 }
124 }
eblas_accumProd_gpu(int N,const double & a,const complex * xU,const complex * xC,double * yRe,double * yIm)125 void eblas_accumProd_gpu(int N, const double& a, const complex* xU, const complex* xC, double* yRe, double* yIm)
126 { GpuLaunchConfig1D glc(eblas_accumProd_kernel, N);
127 eblas_accumProd_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, a, xU, xC, yRe, yIm);
128 gpuErrorCheck();
129 }
130
131 template<typename scalar> __global__
eblas_symmetrize_kernel(int N,int n,const int * symmIndex,scalar * x,double nInv)132 void eblas_symmetrize_kernel(int N, int n, const int* symmIndex, scalar* x, double nInv)
133 { int i=kernelIndex1D();
134 if(i<N) eblas_symmetrize_calc(i, n, symmIndex, x, nInv);
135 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,scalar * x)136 template<typename scalar> void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, scalar* x)
137 { GpuLaunchConfig1D glc(eblas_symmetrize_kernel<scalar>, N);
138 eblas_symmetrize_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, x, 1./n);
139 gpuErrorCheck();
140 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,double * x)141 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, double* x) { eblas_symmetrize_gpu<double>(N, n, symmIndex, x); }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,complex * x)142 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, complex* x) { eblas_symmetrize_gpu<complex>(N, n, symmIndex, x); }
143
144 __global__
eblas_symmetrize_phase_kernel(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,complex * x)145 void eblas_symmetrize_phase_kernel(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, complex* x)
146 { int i=kernelIndex1D();
147 if(i<N) eblas_symmetrize_phase_calc(i, n, symmIndex, symmMult, phase, x);
148 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,complex * x)149 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, complex* x)
150 { GpuLaunchConfig1D glc(eblas_symmetrize_phase_kernel, N);
151 eblas_symmetrize_phase_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, symmMult, phase, x);
152 gpuErrorCheck();
153 }
154
155 __global__
eblas_symmetrize_phase_rot_kernel(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,const matrix3<> * rotSpin,complexPtr4 x)156 void eblas_symmetrize_phase_rot_kernel(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, const matrix3<>* rotSpin, complexPtr4 x)
157 { int i=kernelIndex1D();
158 if(i<N) eblas_symmetrize_phase_rot_calc(i, n, symmIndex, symmMult, phase, rotSpin, x);
159 }
eblas_symmetrize_gpu(int N,int n,const int * symmIndex,const int * symmMult,const complex * phase,const matrix3<> * rotSpin,std::vector<complex * > x)160 void eblas_symmetrize_gpu(int N, int n, const int* symmIndex, const int* symmMult, const complex* phase, const matrix3<>* rotSpin, std::vector<complex*> x)
161 { GpuLaunchConfig1D glc(eblas_symmetrize_phase_rot_kernel, N);
162 eblas_symmetrize_phase_rot_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, n, symmIndex, symmMult, phase, rotSpin, complexPtr4(x));
163 gpuErrorCheck();
164 }
165
166 //BLAS-1 wrappers:
eblas_dscal_gpu(int N,double a,double * x,int incx)167 void eblas_dscal_gpu(int N, double a, double* x, int incx)
168 { cublasDscal_v2(cublasHandle, N, &a, x, incx);
169 }
eblas_zdscal_gpu(int N,double a,complex * x,int incx)170 void eblas_zdscal_gpu(int N, double a, complex* x, int incx)
171 { cublasZdscal_v2(cublasHandle, N, &a, (double2*)x, incx);
172 }
eblas_zscal_gpu(int N,const complex & a,complex * x,int incx)173 void eblas_zscal_gpu(int N, const complex& a, complex* x, int incx)
174 { cublasZscal_v2(cublasHandle, N, (const double2*)&a, (double2*)x, incx);
175 }
eblas_daxpy_gpu(int N,double a,const double * x,int incx,double * y,int incy)176 void eblas_daxpy_gpu(int N, double a, const double* x, int incx, double* y, int incy)
177 { cublasDaxpy_v2(cublasHandle, N, &a, x, incx, y, incy);
178 }
eblas_zaxpy_gpu(int N,const complex & a,const complex * x,int incx,complex * y,int incy)179 void eblas_zaxpy_gpu(int N, const complex& a, const complex* x, int incx, complex* y, int incy)
180 { cublasZaxpy_v2(cublasHandle, N, (const double2*)&a, (const double2*)x, incx, (double2*)y, incy);
181 }
eblas_zdotc_gpu(int N,const complex * x,int incx,const complex * y,int incy)182 complex eblas_zdotc_gpu(int N, const complex* x, int incx, const complex* y, int incy)
183 { complex result;
184 cublasZdotc_v2(cublasHandle, N, (const double2*)x, incx, (const double2*)y, incy, (double2*)&result);
185 return result;
186 }
eblas_ddot_gpu(int N,const double * x,int incx,const double * y,int incy)187 double eblas_ddot_gpu(int N, const double* x, int incx, const double* y, int incy)
188 { double result;
189 cublasDdot_v2(cublasHandle, N, x, incx, y, incy, &result);
190 return result;
191 }
eblas_dznrm2_gpu(int N,const complex * x,int incx)192 double eblas_dznrm2_gpu(int N, const complex* x, int incx)
193 { double result;
194 cublasDznrm2_v2(cublasHandle, N, (const double2*)x, incx, &result);
195 return result;
196 }
eblas_dnrm2_gpu(int N,const double * x,int incx)197 double eblas_dnrm2_gpu(int N, const double* x, int incx)
198 { double result;
199 cublasDnrm2_v2(cublasHandle, N, x, incx, &result);
200 return result;
201 }
202
203
204 //Min-max:
205
206 //forward declare the cpu version (used at the end for colletcing results):
207 void eblas_capMinMax(const int N, double* x, double& xMin, double& xMax, double capLo=-DBL_MAX, double capHi=+DBL_MAX);
208
209 extern __shared__ double xMinLoc[];
210 __global__
eblas_capMinMax_kernel(int N,double * x,double * xMinBlk,double * xMaxBlk,double capLo,double capHi)211 void eblas_capMinMax_kernel(int N, double* x, double* xMinBlk, double* xMaxBlk, double capLo, double capHi)
212 { int i=kernelIndex1D();
213 int iThread = threadIdx.x;
214 //Store the original value as the local min and max:
215 double* xMaxLoc = xMinLoc + blockDim.x;
216 if(i<N)
217 { xMinLoc[iThread] = x[i];
218 xMaxLoc[iThread] = x[i];
219 //Cap the value in the array:
220 if(x[i]<capLo) x[i]=capLo;
221 if(x[i]>capHi) x[i]=capHi;
222 }
223 else
224 { xMinLoc[iThread] = +DBL_MAX;
225 xMaxLoc[iThread] = -DBL_MAX;
226 }
227 //Min-max within block:
228 int extent = blockDim.x/2;
229 int stride = (blockDim.x+1)/2;
230 while(extent)
231 { __syncthreads();
232 if(iThread<extent)
233 { if(xMinLoc[iThread+stride]<xMinLoc[iThread]) xMinLoc[iThread]=xMinLoc[iThread+stride];
234 if(xMaxLoc[iThread+stride]>xMaxLoc[iThread]) xMaxLoc[iThread]=xMaxLoc[iThread+stride];
235 }
236 extent = stride/2;
237 stride = (stride+1)/2;
238 }
239 __syncthreads();
240 //Save to the global min-max:
241 if(iThread==0)
242 { int iBlock = blockIdx.y*gridDim.x+blockIdx.x;
243 xMinBlk[iBlock] = xMinLoc[0];
244 xMaxBlk[iBlock] = xMaxLoc[0];
245 }
246 }
eblas_capMinMax_gpu(const int N,double * x,double & xMin,double & xMax,double capLo,double capHi)247 void eblas_capMinMax_gpu(const int N, double* x, double& xMin, double& xMax, double capLo, double capHi)
248 { GpuLaunchConfig1D glc(eblas_capMinMax_kernel, N);
249 int nBlocksTot = glc.nBlocks.x * glc.nBlocks.y;
250 //First perform the capping and obtain the max and min per block:
251 double* xMinBlk; cudaMalloc(&xMinBlk, sizeof(double)*nBlocksTot*2);
252 double* xMaxBlk = xMinBlk + nBlocksTot;
253 int sharedMemBytes = 2*glc.nPerBlock.x*sizeof(double);
254 eblas_capMinMax_kernel<<<glc.nBlocks,glc.nPerBlock,sharedMemBytes>>>(N,x,xMinBlk,xMaxBlk,capLo,capHi);
255 //Finish on the CPU:
256 double* xMinCpu = new double[2*nBlocksTot];
257 double* xMaxCpu = xMinCpu + nBlocksTot;
258 cudaMemcpy(xMinCpu, xMinBlk, sizeof(double)*nBlocksTot*2, cudaMemcpyDeviceToHost);
259 cudaFree(xMinBlk);
260 xMin = *std::min_element(xMinCpu, xMinCpu+nBlocksTot);
261 xMax = *std::max_element(xMaxCpu, xMaxCpu+nBlocksTot);
262 delete[] xMinCpu;
263 }
264