1 /*-------------------------------------------------------------------
2 Copyright 2018 Ravishankar Sundararaman
3
4 This file is part of JDFTx.
5
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with JDFTx. If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19
20 #include <core/GpuKernelUtils.h>
21 #include <cublas_v2.h>
22
//Gather kernel: copy the iDelta x jDelta sub-matrix (start/stride per dimension)
//out of the column-major matrix in (leading dimension nr) into contiguous out.
__global__
void matrixSubGet_kernel(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out)
{	int i = kernelIndex(x);
	int j = kernelIndex(y);
	if((i < iDelta) && (j < jDelta))
		out[iDelta*j + i] = in[nr*(jStart + j*jStep) + (iStart + i*iStep)];
}
//Launch wrapper for matrixSubGet_kernel: one thread per entry of the iDelta x jDelta output.
void matrixSubGet_gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out)
{	vector3<int> extent(1, jDelta, iDelta); //columns along y, rows along x
	GpuLaunchConfig3D glc(matrixSubGet_kernel, extent);
	matrixSubGet_kernel<<<glc.nBlocks, glc.nPerBlock>>>(nr, iStart, iStep, iDelta, jStart, jStep, jDelta, in, out);
	gpuErrorCheck();
}
35
//Macro generating the scatter counterparts of matrixSubGet: a kernel and its launch
//wrapper that write the contiguous iDelta x jDelta sub-matrix `in` back into the
//column-major matrix `out` (leading dimension nr) at rows iStart + i*iStep and
//columns jStart + j*jStep. OP is the update operator: `=` overwrites, `+=` accumulates.
//(Comments must stay outside the macro body: a // comment would swallow the trailing backslash.)
#define DECLARE_matrixSubSetAccum(NAME, OP) \
__global__ \
void matrixSub ##NAME## _kernel(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
{ int i = kernelIndex(x); if(i>=iDelta) return; \
int j = kernelIndex(y); if(j>=jDelta) return; \
out[nr*(j*jStep+jStart) + (i*iStep+iStart)] OP in[iDelta*j+i]; \
} \
void matrixSub ##NAME## _gpu(int nr, int iStart, int iStep, int iDelta, int jStart, int jStep, int jDelta, const complex* in, complex* out) \
{ GpuLaunchConfig3D glc(matrixSub ##NAME## _kernel, vector3<int>(1,jDelta,iDelta)); \
matrixSub ##NAME## _kernel<<<glc.nBlocks,glc.nPerBlock>>>(nr, iStart,iStep,iDelta, jStart,jStep,jDelta, in, out); \
gpuErrorCheck(); \
}
DECLARE_matrixSubSetAccum(Set, =) //defines matrixSubSet_kernel / matrixSubSet_gpu (overwrite)
DECLARE_matrixSubSetAccum(Accum, +=) //defines matrixSubAccum_kernel / matrixSubAccum_gpu (accumulate)
#undef DECLARE_matrixSubSetAccum
51
52
53 //----- diagonal matrix multiplies
//Right-multiply by a diagonal matrix: out = M * diag(D),
//where M is nRows x nCols column-major and D has nCols entries.
template<typename scalar> __global__ void mulMD_kernel(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
{	int iRow = kernelIndex(x);
	int iCol = kernelIndex(y);
	if((iRow < nRows) && (iCol < nCols))
	{	int k = iCol*nRows + iRow; //column-major flat index
		out[k] = M[k] * D[iCol];
	}
}
//Left-multiply by a diagonal matrix: out = diag(D) * M,
//where M is nRows x nCols column-major and D has nRows entries.
template<typename scalar> __global__ void mulDM_kernel(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
{	int iRow = kernelIndex(x);
	int iCol = kernelIndex(y);
	if((iRow < nRows) && (iCol < nCols))
	{	int k = iCol*nRows + iRow; //column-major flat index
		out[k] = D[iRow] * M[k];
	}
}
//Launch wrapper for mulMD_kernel: out = M * diag(D), M of size nRows x nCols (column-major).
//Columns map to the size-limited y grid dimension, so for large nCols the work is
//broken into column blocks of at most nColsMax, each handled by a recursive call.
//(Fix: removed an unused local `nLoop` computed via ceildiv and never read.)
template<typename scalar> void mulMD_gpu(int nRows, int nCols, const complex* M, const scalar* D, complex* out)
{	static const int nColsMax = 32768;
	if(nCols > nColsMax)
	{	//Break up into smaller calls to avoid CUDA grid limits:
		for(int colStart=0; colStart<nCols; colStart+=nColsMax)
		{	int nColsCur = std::min(nCols-colStart, nColsMax);
			int offset = colStart * nRows; //start of current column block within M and out
			mulMD_gpu<scalar>(nRows, nColsCur, M+offset, D+colStart, out+offset);
		}
		return;
	}
	GpuLaunchConfig3D glc(mulMD_kernel<scalar>, vector3<int>(1,nCols,nRows));
	mulMD_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(nRows, nCols, M, D, out);
	gpuErrorCheck();
}
//Launch wrapper for mulDM_kernel: out = diag(D) * M, M of size nRows x nCols (column-major).
//Columns map to the size-limited y grid dimension, so large nCols is split into
//column blocks exactly as in mulMD_gpu (previously missing here, which could exceed
//CUDA grid limits for nCols > 65535). D indexes rows, so it needs no offset per block.
template<typename scalar> void mulDM_gpu(int nRows, int nCols, const scalar* D, const complex* M, complex* out)
{	static const int nColsMax = 32768;
	if(nCols > nColsMax)
	{	//Break up into smaller calls to avoid CUDA grid limits:
		for(int colStart=0; colStart<nCols; colStart+=nColsMax)
		{	int nColsCur = std::min(nCols-colStart, nColsMax);
			int offset = colStart * nRows; //start of current column block within M and out
			mulDM_gpu<scalar>(nRows, nColsCur, D, M+offset, out+offset);
		}
		return;
	}
	GpuLaunchConfig3D glc(mulDM_kernel<scalar>, vector3<int>(1,nCols,nRows));
	mulDM_kernel<scalar><<<glc.nBlocks,glc.nPerBlock>>>(nRows, nCols, D, M, out);
	gpuErrorCheck();
}
//Non-template entry points instantiating the diagonal multiplies for double and complex D:
void mulMDdouble_gpu(int nRows, int nCols, const complex* M, const double* D, complex* out)
{	mulMD_gpu<double>(nRows, nCols, M, D, out);
}
void mulDMdouble_gpu(int nRows, int nCols, const double* D, const complex* M, complex* out)
{	mulDM_gpu<double>(nRows, nCols, D, M, out);
}
void mulMDcomplex_gpu(int nRows, int nCols, const complex* M, const complex* D, complex* out)
{	mulMD_gpu<complex>(nRows, nCols, M, D, out);
}
void mulDMcomplex_gpu(int nRows, int nCols, const complex* D, const complex* M, complex* out)
{	mulDM_gpu<complex>(nRows, nCols, D, M, out);
}
91
92
//Column dot products: out[iCol] = sum over rows of Re(conj(X) * Y) for column iCol
//of the nRows x nCols column-major matrices X and Y (one thread per column).
__global__
void diagDot_kernel(int nRows, int nCols, const complex* X, const complex* Y, double* out)
{	int iCol = kernelIndex1D();
	if(iCol < nCols)
	{	const complex* Xcol = X + iCol*nRows; //start of this column in X
		const complex* Ycol = Y + iCol*nRows; //start of this column in Y
		double sum = 0.;
		for(int iRow=0; iRow<nRows; iRow++)
			sum += (Xcol[iRow].conj() * Ycol[iRow]).real();
		out[iCol] = sum;
	}
}
//Launch wrapper for diagDot_kernel: one thread per column.
void diagDot_gpu(int nRows, int nCols, const complex* X, const complex* Y, double* out)
{	GpuLaunchConfig1D glc(diagDot_kernel, nCols);
	diagDot_kernel<<<glc.nBlocks, glc.nPerBlock>>>(nRows, nCols, X, Y, out);
	gpuErrorCheck();
}
108
109
//Per-row contributions to the hermiticity error of the N x N matrix data:
//buf[i] = sum_j |data_ij - conj(data_ji)|^2 and buf[N+i] = sum_j |data_ij|^2
//(one thread per row i; reduced over rows by the host-side wrapper).
__global__
void relativeHermiticityError_kernel(int N, const complex* data, double* buf)
{	int i = kernelIndex1D();
	if(i >= N) return;
	double errNum = 0., errDen = 0.;
	for(int j=0; j<N; j++)
	{	complex Mij = data[N*i + j];
		complex Mji = data[N*j + i]; //transposed entry
		errNum += norm(Mij - Mji.conj());
		errDen += norm(Mij);
	}
	buf[i] = errNum;
	buf[N+i] = errDen;
}
//Relative hermiticity error of the N x N device matrix data:
//sqrt( sum_ij |M_ij - conj(M_ji)|^2 / (N sum_ij |M_ij|^2) ).
//NOTE(review): returns NaN for an identically-zero matrix (denominator 0) — presumably
//callers never pass one; confirm if that case matters.
double relativeHermiticityError_gpu(int N, const complex* data)
{	GpuLaunchConfig1D glc(relativeHermiticityError_kernel, N);
	double* buf; //device buffer: per-row numerators in [0,N), denominators in [N,2N)
	cudaMalloc(&buf, sizeof(double)*(2*N));
	gpuErrorCheck(); //catch allocation failure before launching (was previously unchecked)
	relativeHermiticityError_kernel<<<glc.nBlocks,glc.nPerBlock>>>(N, data, buf);
	gpuErrorCheck();
	double errNum = 0., errDen = 0.;
	cublasDasum(cublasHandle, N, buf, 1, &errNum); //entries are norms (>=0), so asum = sum
	cublasDasum(cublasHandle, N, buf+N, 1, &errDen);
	cudaFree(buf);
	return sqrt(errNum / (errDen*N));
}
136