1 /*-------------------------------------------------------------------
2 Copyright 2011 Ravishankar Sundararaman
3 
4 This file is part of JDFTx.
5 
6 JDFTx is free software: you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation, either version 3 of the License, or
9 (at your option) any later version.
10 
11 JDFTx is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14 GNU General Public License for more details.
15 
16 You should have received a copy of the GNU General Public License
17 along with JDFTx.  If not, see <http://www.gnu.org/licenses/>.
18 -------------------------------------------------------------------*/
19 
20 #include <core/GpuKernelUtils.h>
21 #include <core/LoopMacros.h>
22 #include <core/Operators_internal.h>
23 
//! Per-point conversion of a full-G complex field vFull to half-G storage vHalf, scaled by scaleFac
//! (presumably real-part extraction — math lives in RealG_calc, Operators_internal.h; one thread per half-G point)
__global__
void RealG_kernel(int zBlock, const vector3<int> S, const complex* vFull, complex* vHalf, double scaleFac)
{	COMPUTE_halfGindices //macro (LoopMacros.h): derives linear index i and G-vector iG for this thread; zBlock selects the z-slab
	RealG_calc(i, iG, S, vFull, vHalf, scaleFac); //shared CPU/GPU per-point implementation
}
//! Host launcher for RealG_kernel: one launch per z-block of the half-G grid, then check for launch errors
void RealG_gpu(const vector3<int> S, const complex* vFull, complex* vHalf, double scaleFac)
{	GpuLaunchConfigHalf3D launchCfg(RealG_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	RealG_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, vFull, vHalf, scaleFac);
		zBlock++;
	}
	gpuErrorCheck();
}
35 
//! Per-point conversion of a full-G complex field vFull to half-G storage vHalf, scaled by scaleFac
//! (presumably imaginary-part extraction — math lives in ImagG_calc, Operators_internal.h)
__global__
void ImagG_kernel(int zBlock, const vector3<int> S, const complex* vFull, complex* vHalf, double scaleFac)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	ImagG_calc(i, iG, S, vFull, vHalf, scaleFac); //shared CPU/GPU per-point implementation
}
//! Host launcher for ImagG_kernel over the half-G grid
void ImagG_gpu(const vector3<int> S, const complex* vFull, complex* vHalf, double scaleFac)
{	GpuLaunchConfigHalf3D launchCfg(ImagG_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	ImagG_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, vFull, vHalf, scaleFac);
		zBlock++;
	}
	gpuErrorCheck();
}
47 
//! Per-point expansion of a half-G field vHalf into a full-G complex field vFull, scaled by scaleFac
//! (math lives in ComplexG_calc, Operators_internal.h; one thread per half-G point)
__global__
void ComplexG_kernel(int zBlock, const vector3<int> S, const complex* vHalf, complex *vFull, double scaleFac)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	ComplexG_calc(i, iG, S, vHalf, vFull, scaleFac); //shared CPU/GPU per-point implementation
}
//! Host launcher for ComplexG_kernel over the half-G grid
void ComplexG_gpu(const vector3<int> S, const complex* vHalf, complex *vFull, double scaleFac)
{	GpuLaunchConfigHalf3D launchCfg(ComplexG_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	ComplexG_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, vHalf, vFull, scaleFac);
		zBlock++;
	}
	gpuErrorCheck();
}
59 
60 
61 
//! Apply the Laplacian in reciprocal space: multiply each half-G coefficient by |G|^2
//! (GGT is the reciprocal-lattice metric; metric_length_squared(iG) = G.GGT.G)
__global__
void L_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* v)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	v[i] *= GGT.metric_length_squared(iG);
}
//! Host launcher for L_kernel (reciprocal-space Laplacian) over the half-G grid
void L_gpu(const vector3<int> S, const matrix3<> GGT, complex* v)
{	GpuLaunchConfigHalf3D launchCfg(L_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	L_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, v);
		zBlock++;
	}
	gpuErrorCheck();
}
73 
//! Apply the inverse Laplacian in reciprocal space: divide each half-G coefficient by |G|^2,
//! zeroing the G=0 (i==0) component to avoid division by zero
__global__
void Linv_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* v)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	v[i] *= i ? 1.0/GGT.metric_length_squared(iG) : 0.0; //i==0 corresponds to G=0
}
//! Host launcher for Linv_kernel (inverse Laplacian) over the half-G grid
void Linv_gpu(const vector3<int> S, const matrix3<> GGT, complex* v)
{	GpuLaunchConfigHalf3D launchCfg(Linv_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	Linv_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, v);
		zBlock++;
	}
	gpuErrorCheck();
}
85 
//! Laplacian on a full (non-reduced) reciprocal-space grid: multiply each coefficient by |G|^2
__global__
void fullL_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* v)
{	COMPUTE_fullGindices //full-G variant of the index macro: one thread per full-grid point
	v[i] *= GGT.metric_length_squared(iG);
}
//! Host launcher for fullL_kernel; note full-grid launch config to match COMPUTE_fullGindices
void fullL_gpu(const vector3<int> S, const matrix3<> GGT, complex* v)
{	GpuLaunchConfig3D launchCfg(fullL_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	fullL_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, v);
		zBlock++;
	}
	gpuErrorCheck();
}
97 
//! Inverse Laplacian on a full reciprocal-space grid: divide by |G|^2, zeroing the G=0 (i==0) component
__global__
void fullLinv_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* v)
{	COMPUTE_fullGindices //full-G variant of the index macro
	v[i] *= i ? 1.0/GGT.metric_length_squared(iG) : 0.0; //i==0 corresponds to G=0
}
//! Host launcher for fullLinv_kernel over the full reciprocal-space grid
void fullLinv_gpu(const vector3<int> S, const matrix3<> GGT, complex* v)
{	GpuLaunchConfig3D launchCfg(fullLinv_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	fullLinv_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, v);
		zBlock++;
	}
	gpuErrorCheck();
}
109 
//! Per-point lattice-derivative (stress) contribution of the Laplacian operator:
//! grad_RRT[i] = weight * Re(conj(X)·Y) * (iG outer iG), where weight accounts for
//! the half-G reduction of real fields (interior points represent a conjugate pair)
__global__
void Lstress_kernel(int zBlock, vector3<int> S, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	COMPUTE_halfGindices
	double weight = ((iG[2]==0) or (2*iG[2]==S[2])) ? 1 : 2; //weight factor for points in reduced reciprocal space of real scalar fields
	grad_RRT[i] = (weight * real(X[i].conj() * Y[i])) * outer(vector3<>(iG));
}
116 
//! Host launcher for Lstress_kernel (Laplacian stress contributions) over the half-G grid.
//! Fix: added gpuErrorCheck() after the launches — every other launcher in this file checks,
//! and without it a bad launch configuration would be silently swallowed until a later sync.
void Lstress_gpu(vector3<int> S, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	GpuLaunchConfigHalf3D glc(Lstress_kernel, S);
	for(int zBlock=0; zBlock<glc.zBlockMax; zBlock++)
		Lstress_kernel<<<glc.nBlocks,glc.nPerBlock>>>(zBlock, S, X, Y, grad_RRT);
	gpuErrorCheck(); //catch launch/configuration errors (was missing)
}
122 
//! Per-point lattice-derivative (stress) contribution of the inverse-Laplacian operator:
//! derivative of 1/|G|^2 brings in a -1/|G|^4 factor; G=0 is zeroed via the Gsq ? : guard
__global__
void LinvStress_kernel(int zBlock, vector3<int> S, const matrix3<> GGT, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	COMPUTE_halfGindices
	double weight = ((iG[2]==0) or (2*iG[2]==S[2])) ? 1 : 2; //weight factor for points in reduced reciprocal space of real scalar fields
	double Gsq = GGT.metric_length_squared(iG);
	double GsqInv = Gsq ? 1./Gsq : 0; //safe inverse: zero contribution at G=0
	grad_RRT[i] = (weight * real(X[i].conj() * Y[i]) * (-GsqInv*GsqInv)) * outer(vector3<>(iG));
}
131 
//! Host launcher for LinvStress_kernel (inverse-Laplacian stress) over the half-G grid.
//! Fix: added gpuErrorCheck() after the launches for consistency with the other launchers here.
void LinvStress_gpu(vector3<int> S, const matrix3<>& GGT, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	GpuLaunchConfigHalf3D glc(LinvStress_kernel, S);
	for(int zBlock=0; zBlock<glc.zBlockMax; zBlock++)
		LinvStress_kernel<<<glc.nBlocks,glc.nPerBlock>>>(zBlock, S, GGT, X, Y, grad_RRT);
	gpuErrorCheck(); //catch launch/configuration errors (was missing)
}
137 
138 
//! Per-point gradient component along one reciprocal-lattice direction Ge
//! (math lives in D_calc, Operators_internal.h; one thread per half-G point)
__global__
void D_kernel(int zBlock, const vector3<int> S, const complex* in, complex* out, vector3<> Ge)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	D_calc(i, iG, in, out, Ge); //shared CPU/GPU per-point implementation
}
//! Host launcher for D_kernel (directional derivative) over the half-G grid
void D_gpu(const vector3<int> S, const complex* in, complex* out, vector3<> Ge)
{	GpuLaunchConfigHalf3D launchCfg(D_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	D_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, in, out, Ge);
		zBlock++;
	}
	gpuErrorCheck();
}
150 
//! Per-point second derivative along reciprocal directions Ge1 and Ge2
//! (math lives in DD_calc, Operators_internal.h; one thread per half-G point)
__global__
void DD_kernel(int zBlock, const vector3<int> S, const complex* in, complex* out, vector3<> Ge1, vector3<> Ge2)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	DD_calc(i, iG, in, out, Ge1, Ge2); //shared CPU/GPU per-point implementation
}
//! Host launcher for DD_kernel (second directional derivative) over the half-G grid
void DD_gpu(const vector3<int> S, const complex* in, complex* out, vector3<> Ge1, vector3<> Ge2)
{	GpuLaunchConfigHalf3D launchCfg(DD_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	DD_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, in, out, Ge1, Ge2);
		zBlock++;
	}
	gpuErrorCheck();
}
162 
//! Per-point spherical-harmonic (angular momentum l) gradient: writes 2l+1 output components
//! (math in lGradient_calc<l>, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
template<int l> __global__
void lGradient_kernel(int zBlock, const vector3<int> S, const complex* in, array<complex*, 2*l+1> out, const matrix3<> G)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	lGradient_calc<l>(i, iG, IS_NYQUIST, in, out, G); //shared CPU/GPU per-point implementation
}
//! Host launcher for lGradient_kernel<l> over the half-G grid (compile-time l)
template<int l> void lGradient_gpu(const vector3<int>& S, const complex* in, array<complex*, 2*l+1> out, const matrix3<>& G)
{	GpuLaunchConfigHalf3D launchCfg(lGradient_kernel<l>, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	lGradient_kernel<l><<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, in, out, G);
		zBlock++;
	}
	gpuErrorCheck();
}
//! Runtime-l entry point: dispatches to the lGradient_gpu<l> template via the SwitchTemplate_l macro
//! (macro expands to a switch over the supported l values)
void lGradient_gpu(const vector3<int>& S, const complex* in, std::vector<complex*> out, int l, const matrix3<>& G)
{	SwitchTemplate_l(l, lGradient_gpu, (S, in, out, G))
}
177 
//! Per-point spherical-harmonic (angular momentum l) divergence: reads 2l+1 input components
//! (math in lDivergence_calc<l>, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
template<int l> __global__
void lDivergence_kernel(int zBlock, const vector3<int> S, const array<const complex*,2*l+1> in, complex* out, const matrix3<> G)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	lDivergence_calc<l>(i, iG, IS_NYQUIST, in, out, G); //shared CPU/GPU per-point implementation
}
//! Host launcher for lDivergence_kernel<l> over the half-G grid (compile-time l)
template<int l> void lDivergence_gpu(const vector3<int>& S, array<const complex*,2*l+1> in, complex* out, const matrix3<>& G)
{	GpuLaunchConfigHalf3D launchCfg(lDivergence_kernel<l>, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	lDivergence_kernel<l><<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, in, out, G);
		zBlock++;
	}
	gpuErrorCheck();
}
//! Runtime-l entry point: dispatches to the lDivergence_gpu<l> template via the SwitchTemplate_l macro
void lDivergence_gpu(const vector3<int>& S, const std::vector<const complex*>& in, complex* out, int l, const matrix3<>& G)
{	SwitchTemplate_l(l, lDivergence_gpu, (S, in, out, G))
}
192 
193 
//! Per-point lattice-derivative (stress) contribution of the (l,m) spherical-harmonic gradient.
//! lPhase = i^l is precomputed on the host; Nyquist-frequency points are dropped and interior
//! half-G points carry weight 2 (they represent a conjugate pair of the real field).
template<int l, int m> __global__
void lGradientStress_kernel(int zBlock, const vector3<int> S, const matrix3<> G, const RadialFunctionG w, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT, complex lPhase)
{	COMPUTE_halfGindices
	double weight = real(conj(X[i]) * Y[i] * lPhase) * (IS_NYQUIST
			? 0 //drop nyquist frequency contributions
			: (iG[2]==0 ? 1 : 2) ); //reciprocal space weights
		grad_RRT[i] = weight * lGradientStress_calc<l*(l+1)+m>(iG, G, w); //l*(l+1)+m: packed (l,m) template index
}
//! Host launcher for lGradientStress_kernel<l,m>: precomputes the i^l phase, then launches per z-block
template<int l, int m> void lGradientStress_gpu(const vector3<int>& S, const matrix3<>& G, const RadialFunctionG& w, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	const complex lPhase = cis(l*0.5*M_PI); //i^l
	GpuLaunchConfigHalf3D launchCfg(lGradientStress_kernel<l,m>, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	lGradientStress_kernel<l,m><<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, G, w, X, Y, grad_RRT, lPhase);
		zBlock++;
	}
	gpuErrorCheck();
}
//! Runtime-(l,m) entry point: dispatches to lGradientStress_gpu<l,m> via the SwitchTemplate_lm macro
void lGradientStress_gpu(const vector3<int>& S, const matrix3<>& G, const RadialFunctionG& w, const complex* X, const complex* Y, int l, int m, symmetricMatrix3<>* grad_RRT)
{	SwitchTemplate_lm(l, m, lGradientStress_gpu, (S, G, w, X, Y, grad_RRT))
}
212 
213 
//! Multiply a real-space field by the Bloch phase for k-point k at each grid point
//! (phase computed by blochPhase_calc from iv and invS = elementwise 1/S; one thread per real-space point)
__global__
void multiplyBlochPhase_kernel(int zBlock, const vector3<int> S, const vector3<> invS, complex* v, const vector3<> k)
{	COMPUTE_rIndices //real-space index macro: derives linear index i and integer coordinates iv
	v[i] *= blochPhase_calc(iv, invS, k);
}
//! Host launcher for multiplyBlochPhase_kernel over the full real-space grid
void multiplyBlochPhase_gpu(const vector3<int>& S, const vector3<>& invS, complex* v, const vector3<>& k)
{	GpuLaunchConfig3D launchCfg(multiplyBlochPhase_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	multiplyBlochPhase_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, invS, v, k);
		zBlock++;
	}
	gpuErrorCheck();
}
225 
226 
//! Evaluate a radial function f at |G| for each half-G point, with translation phase for center r0
//! (math lives in radialFunction_calc, Operators_internal.h)
__global__
void radialFunction_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT,
	complex* F, const RadialFunctionG f, vector3<> r0)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	F[i] = radialFunction_calc(iG, GGT, f, r0); //shared CPU/GPU per-point implementation
}
//! Host launcher for radialFunction_kernel over the half-G grid
void radialFunction_gpu(const vector3<int> S, const matrix3<>& GGT,
	complex* F, const RadialFunctionG& f, vector3<> r0)
{	GpuLaunchConfigHalf3D launchCfg(radialFunction_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	radialFunction_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, F, f, r0);
		zBlock++;
	}
	gpuErrorCheck();
}
240 
//! Multiply each half-G coefficient by a radial function evaluated at |G| (convolution with a radial kernel)
__global__
void radialFunctionMultiply_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* in, const RadialFunctionG f)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	in[i] *= f(sqrt(GGT.metric_length_squared(iG))); //f is interpolated/evaluated at G = |iG| in the GGT metric
}
//! Host launcher for radialFunctionMultiply_kernel over the half-G grid
void radialFunctionMultiply_gpu(const vector3<int> S, const matrix3<>& GGT, complex* in, const RadialFunctionG& f)
{	GpuLaunchConfigHalf3D launchCfg(radialFunctionMultiply_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	radialFunctionMultiply_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, GGT, in, f);
		zBlock++;
	}
	gpuErrorCheck();
}
252 
//! Per-point lattice-derivative (stress) contribution of convolution with radial kernel w:
//! uses -w'(G)/G so the iG outer-product form stays finite, with the G=0 term zeroed
__global__
void convolveStress_kernel(int zBlock, vector3<int> S, const matrix3<> GGT, const RadialFunctionG w, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	COMPUTE_halfGindices
	double weight = ((iG[2]==0) or (2*iG[2]==S[2])) ? 1 : 2; //weight factor for points in reduced reciprocal space of real scalar fields
	double G = sqrt(GGT.metric_length_squared(iG));
	double minus_wPrime_by_G = G ? (-w.deriv(G)/G) : 0.; //guard against division by zero at G=0
	grad_RRT[i] = (weight * minus_wPrime_by_G * real(X[i].conj() * Y[i])) * outer(vector3<>(iG));
}
261 
//! Host launcher for convolveStress_kernel over the half-G grid.
//! Fix: added gpuErrorCheck() after the launches for consistency with the other launchers here.
void convolveStress_gpu(vector3<int> S, const matrix3<>& GGT, const RadialFunctionG& w, const complex* X, const complex* Y, symmetricMatrix3<>* grad_RRT)
{	GpuLaunchConfigHalf3D glc(convolveStress_kernel, S);
	for(int zBlock=0; zBlock<glc.zBlockMax; zBlock++)
		convolveStress_kernel<<<glc.nBlocks,glc.nPerBlock>>>(zBlock, S, GGT, w, X, Y, grad_RRT);
	gpuErrorCheck(); //catch launch/configuration errors (was missing)
}
267 
//! Elementwise in-place X[i] = exp(prefac*X[i]) over an N-element array (1D launch, bounds-guarded)
__global__
void exp_kernel(int N, double* X, double prefac)
{	int i = kernelIndex1D();
	if(i >= N) return; //grid may overshoot N
	X[i] = exp(prefac * X[i]);
}
//! Host launcher for exp_kernel (elementwise exponential)
void exp_gpu(int N, double* X, double prefac)
{	GpuLaunchConfig1D launchCfg(exp_kernel, N);
	exp_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(N, X, prefac);
	gpuErrorCheck();
}
277 
//! Elementwise in-place X[i] = log(prefac*X[i]) over an N-element array (1D launch, bounds-guarded)
__global__
void log_kernel(int N, double* X, double prefac)
{	int i = kernelIndex1D();
	if(i >= N) return; //grid may overshoot N
	X[i] = log(prefac * X[i]);
}
//! Host launcher for log_kernel (elementwise logarithm)
void log_gpu(int N, double* X, double prefac)
{	GpuLaunchConfig1D launchCfg(log_kernel, N);
	log_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(N, X, prefac);
	gpuErrorCheck();
}
287 
//! Elementwise in-place X[i] = sqrt(prefac*X[i]) over an N-element array (1D launch, bounds-guarded)
__global__
void sqrt_kernel(int N, double* X, double prefac)
{	int i = kernelIndex1D();
	if(i >= N) return; //grid may overshoot N
	X[i] = sqrt(prefac * X[i]);
}
//! Host launcher for sqrt_kernel (elementwise square root)
void sqrt_gpu(int N, double* X, double prefac)
{	GpuLaunchConfig1D launchCfg(sqrt_kernel, N);
	sqrt_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(N, X, prefac);
	gpuErrorCheck();
}
297 
//! Elementwise in-place X[i] = prefac/X[i] over an N-element array (1D launch, bounds-guarded)
__global__
void inv_kernel(int N, double* X, double prefac)
{	int i = kernelIndex1D();
	if(i >= N) return; //grid may overshoot N
	X[i] = prefac / X[i];
}
//! Host launcher for inv_kernel (elementwise reciprocal)
void inv_gpu(int N, double* X, double prefac)
{	GpuLaunchConfig1D launchCfg(inv_kernel, N);
	inv_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(N, X, prefac);
	gpuErrorCheck();
}
307 
//! Elementwise in-place X[i] = pow(scale*X[i], alpha) over an N-element array (1D launch, bounds-guarded)
__global__
void pow_kernel(int N, double* X, double scale, double alpha)
{	int i = kernelIndex1D();
	if(i >= N) return; //grid may overshoot N
	X[i] = pow(scale*X[i], alpha);
}
//! Host launcher for pow_kernel (elementwise power)
void pow_gpu(int N, double* X, double scale, double alpha)
{	GpuLaunchConfig1D launchCfg(pow_kernel, N);
	pow_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(N, X, scale, alpha);
	gpuErrorCheck();
}
317 
//! Multiply each half-G coefficient by a Gaussian of width sigma: exp(-sigma^2 |G|^2 / 2)
__global__
void gaussConvolve_kernel(int zBlock, const vector3<int> S, const matrix3<> GGT, complex* data, double sigma)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	data[i] *= exp(-0.5*sigma*sigma*GGT.metric_length_squared(iG));
}
//! Host launcher for gaussConvolve_kernel over the half-G grid.
//! Fixes: (1) the kernel indexes via COMPUTE_halfGindices, so use GpuLaunchConfigHalf3D like
//! every other half-G launcher in this file — the previous full-grid GpuLaunchConfig3D launched
//! ~2x excess threads that only returned via the macro's bounds check; (2) added the
//! gpuErrorCheck() that all sibling launchers perform.
void gaussConvolve_gpu(const vector3<int>& S, const matrix3<>& GGT, complex* data, double sigma)
{	GpuLaunchConfigHalf3D glc(gaussConvolve_kernel, S); //half-G config to match COMPUTE_halfGindices
	for(int zBlock=0; zBlock<glc.zBlockMax; zBlock++)
		gaussConvolve_kernel<<<glc.nBlocks,glc.nPerBlock>>>(zBlock, S, GGT, data, sigma);
	gpuErrorCheck(); //catch launch/configuration errors (was missing)
}
328 
329 
//! Per-point transfer of half-G data between grids of different sizes Sin -> Sout
//! (index mapping in changeGrid_calc, Operators_internal.h; S is the loop grid)
__global__
void changeGrid_kernel(int zBlock, const vector3<int> S, const vector3<int> Sin, const vector3<int> Sout, const complex* in, complex* out)
{	COMPUTE_halfGindices //derives G-vector iG over grid S; zBlock selects the z-slab
	changeGrid_calc(iG, Sin, Sout, in, out); //shared CPU/GPU per-point implementation
}
//! Host launcher for changeGrid_kernel over the half-G loop grid S
void changeGrid_gpu(const vector3<int>& S, const vector3<int>& Sin, const vector3<int>& Sout, const complex* in, complex* out)
{	GpuLaunchConfigHalf3D launchCfg(changeGrid_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	changeGrid_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, Sin, Sout, in, out);
		zBlock++;
	}
	gpuErrorCheck();
}
341 
//! Per-point transfer of full-G data between grids of different sizes Sin -> Sout
//! (index mapping in changeGridFull_calc, Operators_internal.h)
__global__
void changeGridFull_kernel(int zBlock, const vector3<int> S, const vector3<int> Sin, const vector3<int> Sout, const complex* in, complex* out)
{	COMPUTE_fullGindices //full-grid index macro: derives G-vector iG over grid S
	changeGridFull_calc(iG, Sin, Sout, in, out); //shared CPU/GPU per-point implementation
}
//! Host launcher for changeGridFull_kernel over the full loop grid S
void changeGridFull_gpu(const vector3<int>& S, const vector3<int>& Sin, const vector3<int>& Sout, const complex* in, complex* out)
{	GpuLaunchConfig3D launchCfg(changeGridFull_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	changeGridFull_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, Sin, Sout, in, out);
		zBlock++;
	}
	gpuErrorCheck();
}
353 
354 
//! Per-point Cartesian gradient of a half-G scalar field into 3 output components
//! (math in gradient_calc, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
__global__
void gradient_kernel(int zBlock, const vector3<int> S, const matrix3<> G, const complex* Xtilde, vector3<complex*> gradTilde)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	gradient_calc(i, iG, IS_NYQUIST, G, Xtilde, gradTilde); //shared CPU/GPU per-point implementation
}
//! Host launcher for gradient_kernel over the half-G grid
void gradient_gpu(const vector3<int> S, const matrix3<> G, const complex* Xtilde, vector3<complex*> gradTilde)
{	GpuLaunchConfigHalf3D launchCfg(gradient_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	gradient_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, G, Xtilde, gradTilde);
		zBlock++;
	}
	gpuErrorCheck();
}
366 
367 
//! Per-point divergence of a 3-component half-G vector field into a scalar field
//! (math in divergence_calc, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
__global__
void divergence_kernel(int zBlock, const vector3<int> S, const matrix3<> G, vector3<const complex*> Vtilde, complex* divTilde)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	divergence_calc(i, iG, IS_NYQUIST, G, Vtilde, divTilde); //shared CPU/GPU per-point implementation
}
//! Host launcher for divergence_kernel over the half-G grid
void divergence_gpu(const vector3<int> S, const matrix3<> G, vector3<const complex*> Vtilde, complex* divTilde)
{	GpuLaunchConfigHalf3D launchCfg(divergence_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	divergence_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, G, Vtilde, divTilde);
		zBlock++;
	}
	gpuErrorCheck();
}
379 
380 
//! Per-point symmetric-tensor gradient of a half-G scalar field (traceless second-derivative components)
//! (math in tensorGradient_calc, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
__global__
void tensorGradient_kernel(int zBlock, const vector3<int> S, const matrix3<> G, const complex* Xtilde, tensor3<complex*> gradTilde)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	tensorGradient_calc(i, iG, IS_NYQUIST, G, Xtilde, gradTilde); //shared CPU/GPU per-point implementation
}
//! Host launcher for tensorGradient_kernel over the half-G grid
void tensorGradient_gpu(const vector3<int> S, const matrix3<> G, const complex* Xtilde, tensor3<complex*> gradTilde)
{	GpuLaunchConfigHalf3D launchCfg(tensorGradient_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	tensorGradient_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, G, Xtilde, gradTilde);
		zBlock++;
	}
	gpuErrorCheck();
}
392 
393 
//! Per-point divergence of a symmetric-tensor half-G field into a scalar field
//! (math in tensorDivergence_calc, Operators_internal.h; IS_NYQUIST flags Nyquist-frequency points)
__global__
void tensorDivergence_kernel(int zBlock, const vector3<int> S, const matrix3<> G, tensor3<const complex*> Vtilde, complex* divTilde)
{	COMPUTE_halfGindices //derives linear index i and G-vector iG; zBlock selects the z-slab
	tensorDivergence_calc(i, iG, IS_NYQUIST, G, Vtilde, divTilde); //shared CPU/GPU per-point implementation
}
//! Host launcher for tensorDivergence_kernel over the half-G grid
void tensorDivergence_gpu(const vector3<int> S, const matrix3<> G, tensor3<const complex*> Vtilde, complex* divTilde)
{	GpuLaunchConfigHalf3D launchCfg(tensorDivergence_kernel, S);
	int zBlock = 0;
	while(zBlock < launchCfg.zBlockMax)
	{	tensorDivergence_kernel<<<launchCfg.nBlocks,launchCfg.nPerBlock>>>(zBlock, S, G, Vtilde, divTilde);
		zBlock++;
	}
	gpuErrorCheck();
}
405