1 /* ----------------------------------------------------------------------
2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
4 Original Version:
5 http://lammps.sandia.gov, Sandia National Laboratories
6 Steve Plimpton, sjplimp@sandia.gov
7
8 See the README file in the top-level LAMMPS directory.
9
10 -----------------------------------------------------------------------
11
12 USER-CUDA Package and associated modifications:
13 https://sourceforge.net/projects/lammpscuda/
14
15 Christian Trott, christian.trott@tu-ilmenau.de
16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17 Theoretical Physics II, University of Technology Ilmenau, Germany
18
19 See the README file in the USER-CUDA directory.
20
21 This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23
24 #define OFFSET 4096
negativCUDA(float f)25 __device__ int negativCUDA(float f)
26 {
27 return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
28 }
29
reduceBlock(float * data)30 __device__ void reduceBlock(float* data)
31 {
32 int p2 = 1;
33
34 while(p2 * 2 < blockDim.x) p2 *= 2;
35
36 if(threadIdx.x < blockDim.x - p2)
37 data[threadIdx.x] += data[threadIdx.x + p2];
38
39 __syncthreads();
40
41 for(int i = 2; i <= p2; i *= 2) {
42 if(threadIdx.x < p2 / i)
43 data[threadIdx.x] += data[threadIdx.x + p2 / i];
44
45 __syncthreads();
46 }
47 }
48
reduceBlock(double * data)49 __device__ void reduceBlock(double* data)
50 {
51 int p2 = 1;
52
53 while(p2 * 2 < blockDim.x) p2 *= 2;
54
55 if(threadIdx.x < blockDim.x - p2)
56 data[threadIdx.x] += data[threadIdx.x + p2];
57
58 __syncthreads();
59
60 for(int i = 2; i <= p2; i *= 2) {
61 if(threadIdx.x < p2 / i)
62 data[threadIdx.x] += data[threadIdx.x + p2 / i];
63
64 __syncthreads();
65 }
66 }
67
68 extern __shared__ PPPM_FLOAT sharedmem[];
69
setup_fkxyz_vg(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald)70 __global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
71 {
72 PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
73 PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
74 PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));
75
76 if((blockIdx.x == 0) && (blockIdx.y == 0)) fkx[threadIdx.x] = my_fkx;
77
78 if((blockIdx.x == 0) && (threadIdx.x == 0)) fky[blockIdx.y] = my_fky;
79
80 if((threadIdx.x == 0) && (blockIdx.y == 0)) fkz[blockIdx.x] = my_fkz;
81
82 __syncthreads();
83
84 if((blockIdx.x >= nzlo_fft) && (blockIdx.x <= nzhi_fft) &&
85 (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) &&
86 (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) {
87 int n = ((int(blockIdx.x) - nzlo_fft) * (nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft;
88 PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
89 PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
90 vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx;
91 vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky;
92 vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz;
93 vg[6 * n + 3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fky;
94 vg[6 * n + 4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fkz;
95 vg[6 * n + 5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky * my_fkz;
96
97 }
98 }
99
gf_denom(PPPM_FLOAT x,PPPM_FLOAT y,PPPM_FLOAT z)100 __device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
101 {
102 PPPM_FLOAT sx, sy, sz;
103 sz = sy = sx = PPPM_F(0.0);
104
105 for(int l = order - 1; l >= 0; l--) {
106 sx = gf_b[l] + sx * x;
107 sy = gf_b[l] + sy * y;
108 sz = gf_b[l] + sz * z;
109 }
110
111 PPPM_FLOAT s = sx * sy * sz;
112 return s * s;
113 }
114
setup_greensfn(PPPM_FLOAT unitkx,PPPM_FLOAT unitky,PPPM_FLOAT unitkz,PPPM_FLOAT g_ewald,int nbx,int nby,int nbz,PPPM_FLOAT xprd,PPPM_FLOAT yprd,PPPM_FLOAT zprd_slab)115 __global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
116 int nbx, int nby, int nbz,
117 PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
118 {
119 PPPM_FLOAT sqk;
120 int nx, ny, nz, kper, lper, mper, k, l, m;
121 PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2;
122 PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
123 PPPM_FLOAT sum1, dot1, dot2;
124 PPPM_FLOAT numerator, denominator;
125
126 PPPM_FLOAT form = PPPM_F(1.0);
127 int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
128 m = blockIdx.x;
129 l = blockIdx.y;
130 k = threadIdx.x;
131
132 mper = m - nz_pppm * (2 * m / nz_pppm);
133 snz = sin(PPPM_F(0.5) * unitkz * mper * zprd_slab / nz_pppm);
134 snz2 = snz * snz;
135
136
137 lper = l - ny_pppm * (2 * l / ny_pppm);
138 sny = sin(PPPM_F(0.5) * unitky * lper * yprd / ny_pppm);
139 sny2 = sny * sny;
140
141 kper = k - nx_pppm * (2 * k / nx_pppm);
142 snx = sin(PPPM_F(0.5) * unitkx * kper * xprd / nx_pppm);
143 snx2 = snx * snx;
144
145 sqk = pow(unitkx * kper, PPPM_F(2.0)) + pow(unitky * lper, PPPM_F(2.0)) +
146 pow(unitkz * mper, PPPM_F(2.0));
147
148 if(sqk != PPPM_F(0.0)) {
149 numerator = form * PPPM_F(12.5663706) / sqk;
150 denominator = gf_denom(snx2, sny2, snz2);
151 sum1 = PPPM_F(0.0);
152
153 for(nx = -nbx; nx <= nbx; nx++) {
154 qx = unitkx * (kper + nx_pppm * nx);
155 sx = exp(PPPM_F(-.25) * pow(qx / g_ewald, PPPM_F(2.0)));
156 wx = PPPM_F(1.0);
157 argx = PPPM_F(0.5) * qx * xprd / nx_pppm;
158
159 if(argx != PPPM_F(0.0)) wx = pow(sin(argx) / argx, order);
160
161 for(ny = -nby; ny <= nby; ny++) {
162 qy = unitky * (lper + ny_pppm * ny);
163 sy = exp(PPPM_F(-.25) * pow(qy / g_ewald, PPPM_F(2.0)));
164 wy = PPPM_F(1.0);
165 argy = PPPM_F(0.5) * qy * yprd / ny_pppm;
166
167 if(argy != PPPM_F(0.0)) wy = pow(sin(argy) / argy, order);
168
169 for(nz = -nbz; nz <= nbz; nz++) {
170 qz = unitkz * (mper + nz_pppm * nz);
171 sz = exp(PPPM_F(-.25) * pow(qz / g_ewald, PPPM_F(2.0)));
172 wz = PPPM_F(1.0);
173 argz = PPPM_F(0.5) * qz * zprd_slab / nz_pppm;
174
175 if(argz != PPPM_F(0.0)) wz = pow(sin(argz) / argz, order);
176
177 dot1 = unitkx * kper * qx + unitky * lper * qy + unitkz * mper * qz;
178 dot2 = qx * qx + qy * qy + qz * qz;
179 sum1 += (dot1 / dot2) * sx * sy * sz * pow(wx * wy * wz, PPPM_F(2.0));
180 }
181 }
182 }
183
184 greensfn[n] = numerator * sum1 / denominator;
185 } else greensfn[n] = PPPM_F(0.0);
186 }
187
poisson_scale_kernel()188 __global__ void poisson_scale_kernel()
189 {
190 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
191 FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
192 work1[2 * i] *= scaleinv * greensfn[i];
193 work1[2 * i + 1] *= scaleinv * greensfn[i];
194 }
195
poisson_xgrad_kernel()196 __global__ void poisson_xgrad_kernel()
197 {
198 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
199 work2[2 * i] = fkx[threadIdx.x] * work1[2 * i + 1];
200 work2[2 * i + 1] = -fkx[threadIdx.x] * work1[2 * i];
201 }
202
poisson_ygrad_kernel()203 __global__ void poisson_ygrad_kernel()
204 {
205 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
206 work2[2 * i] = fky[blockIdx.y] * work1[2 * i + 1];
207 work2[2 * i + 1] = -fky[blockIdx.y] * work1[2 * i];
208 }
209
poisson_zgrad_kernel()210 __global__ void poisson_zgrad_kernel()
211 {
212 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
213 work2[2 * i] = fkz[blockIdx.x] * work1[2 * i + 1];
214 work2[2 * i + 1] = -fkz[blockIdx.x] * work1[2 * i];
215 }
216
poisson_vdx_brick_kernel(int ilo,int jlo,int klo)217 __global__ void poisson_vdx_brick_kernel(int ilo, int jlo, int klo)
218 {
219 int k = blockIdx.x + klo;
220 k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
221 int j = blockIdx.y + jlo;
222 j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
223 int i = threadIdx.x + ilo;
224 i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
225 vdx_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
226 }
227
poisson_vdy_brick_kernel(int ilo,int jlo,int klo)228 __global__ void poisson_vdy_brick_kernel(int ilo, int jlo, int klo)
229 {
230 int k = blockIdx.x + klo;
231 k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
232 int j = blockIdx.y + jlo;
233 j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
234 int i = threadIdx.x + ilo;
235 i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
236 vdy_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
237 }
238
poisson_vdz_brick_kernel(int ilo,int jlo,int klo)239 __global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo)
240 {
241 int k = blockIdx.x + klo;
242 k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
243 int j = blockIdx.y + jlo;
244 j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
245 int i = threadIdx.x + ilo;
246 i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
247 vdz_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
248 }
249
poisson_energy_kernel(int nxlo_fft,int nylo_fft,int nzlo_fft,int vflag)250 __global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag)
251 {
252 ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
253 int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft;
254 ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
255 ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
256 s_energy[threadIdx.x] = myenergy;
257
258 __syncthreads();
259 reduceBlock(s_energy);
260
261 if(threadIdx.x == 0)
262 energy[blockIdx.x * ny_pppm + blockIdx.y] = s_energy[0];
263
264 if(vflag) {
265 __syncthreads();
266
267 for(int j = 0; j < 6; j++) {
268 s_energy[threadIdx.x] = myenergy * vg[((blockIdx.x * gridDim.y + blockIdx.y) * (blockDim.x) + threadIdx.x) * 6 + j];
269 __syncthreads();
270 reduceBlock(s_energy);
271
272 if(threadIdx.x == 0)
273 virial[blockIdx.x * ny_pppm + blockIdx.y + j * nz_pppm * ny_pppm] = s_energy[0];
274 }
275 }
276 }
277
278
sum_energy_kernel1(int vflag)279 __global__ void sum_energy_kernel1(int vflag)
280 {
281 ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
282 ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
283 s_energy[threadIdx.x] = myenergy;
284 __syncthreads();
285 reduceBlock(s_energy);
286
287 if(threadIdx.x == 0)
288 energy[blockIdx.x * ny_pppm] = s_energy[0];
289
290 if(vflag) {
291 __syncthreads();
292
293 for(int j = 0; j < 6; j++) {
294 myenergy = virial[blockIdx.x * ny_pppm + threadIdx.x + j * ny_pppm * nz_pppm];
295 s_energy[threadIdx.x] = myenergy;
296 __syncthreads();
297 reduceBlock(s_energy);
298
299 if(threadIdx.x == 0)
300 virial[blockIdx.x * ny_pppm + j * ny_pppm * nz_pppm] = s_energy[0];
301 }
302 }
303
304 }
305
sum_energy_kernel2(int vflag)306 __global__ void sum_energy_kernel2(int vflag)
307 {
308 ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm];
309 ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
310 s_energy[threadIdx.x] = myenergy;
311 __syncthreads();
312 reduceBlock(s_energy);
313
314 if(threadIdx.x == 0)
315 energy[0] = s_energy[0];
316
317 if(vflag) {
318 __syncthreads();
319
320 for(int j = 0; j < 6; j++) {
321 myenergy = virial[threadIdx.x * ny_pppm + j * ny_pppm * nz_pppm];
322 s_energy[threadIdx.x] = myenergy;
323 __syncthreads();
324 reduceBlock(s_energy);
325
326 if(threadIdx.x == 0)
327 virial[j] = s_energy[0];
328 }
329 }
330 }
331
rho1d(int k,PPPM_FLOAT d,PPPM_FLOAT * srho_coeff)332 __device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff)
333 {
334 PPPM_FLOAT rho1d_tmp = PPPM_F(0.0);
335
336 for(int l = order - 1; l >= 0; l--)
337 rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;
338
339 return rho1d_tmp;
340 }
341
particle_map_kernel(int * flag)342 __global__ void particle_map_kernel(int* flag)
343 {
344 int i = blockIdx.x * blockDim.x + threadIdx.x;
345
346 if(i < nlocal) {
347 int nx, ny, nz;
348 PPPM_FLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET;
349 nx = (int)((_x[i] - _boxlo[0]) * delxinv + shift); // - OFFSET;
350 ny = (int)((_x[i + nmax] - _boxlo[1]) * delyinv + shift); // - OFFSET;
351 nz = (int)((_x[i + 2 * nmax] - _boxlo[2]) * delzinv + shift); // - OFFSET;
352
353 part2grid[i] = nx;
354 part2grid[i + nmax] = ny;
355 part2grid[i + 2 * nmax] = nz;
356
357 // check that entire stencil around nx,ny,nz will fit in my 3d brick
358 if(nx + nlower < nxlo_out || nx + nupper > nxhi_out ||
359 ny + nlower < nylo_out || ny + nupper > nyhi_out ||
360 nz + nlower < nzlo_out || nz + nupper > nzhi_out) {
361 flag[0]++;
362 debugdata[0] = i;
363 debugdata[1] = _boxlo[0];
364 debugdata[2] = _boxlo[1];
365 debugdata[3] = _boxlo[2];
366 debugdata[4] = nx;
367 debugdata[5] = ny;
368 debugdata[6] = nz;
369 debugdata[7] = _x[i];
370 debugdata[8] = _x[i + _nmax];
371 debugdata[9] = _x[i + 2 * _nmax];
372 debugdata[10] = nlocal;
373
374 }
375 }
376 }
377
make_rho_kernelA()378 __global__ void make_rho_kernelA()
379 {
380 int i, l, m, n, nx, ny, nz, mx, my, mz;
381
382 // clear 3d density array
383
384
385 // loop over my charges, add their contribution to nearby grid points
386 // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
387 // (dx,dy,dz) = distance to "lower left" grid pt
388 // (mx,my,mz) = global coords of moving stencil pt
389
390 i = blockIdx.x * blockDim.x + threadIdx.x;
391
392 if(i < nlocal) {
393
394 PPPM_FLOAT dx, dy, dz, x0, y0, z0;
395 nx = part2grid[i];
396 ny = part2grid[i + nmax];
397 nz = part2grid[i + 2 * nmax];
398 dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
399 dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
400 dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
401
402 z0 = delxinv * delyinv * delzinv * _q[i];
403
404 for(n = nlower; n <= nupper; n++) {
405 mz = n + nz;
406 y0 = z0 * rho1d(n, dz, rho_coeff);
407
408 for(m = nlower; m <= nupper; m++) {
409 my = m + ny;
410 x0 = y0 * rho1d(m, dy, rho_coeff);
411
412 for(l = nlower; l <= nupper; l++) {
413 mx = l + nx;
414 int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;
415
416 while(atomicAdd(&density_brick_int[mzyx], 1) != 0) atomicAdd(&density_brick_int[mzyx], -1);
417
418 density_brick[mzyx] += x0 * rho1d(l, dx, rho_coeff);
419 __threadfence();
420 atomicAdd(&density_brick_int[mzyx], -1);
421 __syncthreads();
422
423 }
424 }
425 }
426 }
427 }
428
make_rho_kernel(int * flag,int read_threads_at_same_time)429 __global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
430 {
431 int i, l, m, n, nx, ny, nz, mx, my, mz, a, b;
432
433 // clear 3d density array
434
435
436 // loop over my charges, add their contribution to nearby grid points
437 // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
438 // (dx,dy,dz) = distance to "lower left" grid pt
439 // (mx,my,mz) = global coords of moving stencil pt
440 // int nzxy=blockIdx.x*gridDim.y+blockIdx.y;
441
442 int nelements = nupper - nlower + 1;
443 int* idx = (int*) sharedmem;
444 int* sdensity_brick_int = &idx[blockDim.x];
445 PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x];
446
447 if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
448 srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
449
450 __syncthreads();
451
452 i = blockIdx.x * blockDim.x + threadIdx.x;
453
454 if(false) {
455 if(i < nlocal) {
456
457 PPPM_FLOAT dx, dy, dz, x0, y0, z0;
458 nx = part2grid[i];
459 ny = part2grid[i + nmax];
460 nz = part2grid[i + 2 * nmax];
461 dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
462 dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
463 dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
464
465 z0 = delxinv * delyinv * delzinv * _q[i];
466
467 for(n = nlower; n <= nupper; n++) {
468 mz = n + nz;
469 y0 = z0 * rho1d(n, dz, srho_coeff);
470
471 for(m = nlower; m <= nupper; m++) {
472 my = m + ny;
473 x0 = y0 * rho1d(m, dy, srho_coeff);
474
475 for(l = nlower; l <= nupper; l++) {
476 mx = l + nx;
477 int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;
478
479 a = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
480 b = (atomicAdd(&density_brick_int[mzyx], a) | a);
481
482 if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
483 flag[1]++;
484
485 if((b) & (0x60000000)) flag[0]++;
486 }
487
488 __syncthreads();
489 }
490 }
491 }
492 }
493
494 return;
495 }
496
497 i = blockIdx.x * blockDim.x + threadIdx.x;
498 {
499
500 PPPM_FLOAT dx, dy, dz, x0, y0, z0, qtmp;
501
502 if(i < nlocal) {
503 qtmp = _q[i];
504 nx = part2grid[i];
505 ny = part2grid[i + nmax];
506 nz = part2grid[i + 2 * nmax];
507 dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
508 dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
509 dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
510 z0 = delxinv * delyinv * delzinv * qtmp;
511 } else {
512 nx = ny = nz = 1;
513 dx = dy = dz = PPPM_F(0.1);
514 }
515
516 __syncthreads();
517
518 for(n = nlower; n <= nupper; n++) {
519 mz = n + nz;
520 y0 = z0 * rho1d(n, dz, srho_coeff);
521
522 for(m = nlower; m <= nupper; m++) {
523 my = m + ny;
524 x0 = y0 * rho1d(m, dy, srho_coeff);
525
526 if(i < nlocal) {
527 idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
528
529 for(l = nlower; l <= nupper; l++) {
530 sdensity_brick_int[threadIdx.x * nelements + l - nlower] = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
531 }
532 } else idx[threadIdx.x] = -1;
533
534 __syncthreads();
535
536 for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
537 int kk = threadIdx.x / nelements;
538
539 if((threadIdx.x < nelements * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
540 a = sdensity_brick_int[ii * nelements + threadIdx.x];
541 //if(a*a>1e-100)
542 b = (atomicAdd(&density_brick_int[idx[ii + kk] + threadIdx.x - kk * nelements], a) | a);
543
544 //else
545 //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a);
546 if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
547 flag[1]++;
548
549 if((b) & (0x60000000)) flag[0]++;
550 }
551 }
552 }
553
554 __syncthreads(); //*/
555 }
556 }
557
558 }
559 }
560
scale_rho_kernel()561 __global__ void scale_rho_kernel()
562 {
563 int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
564 density_brick[i] = (1.0 / density_intScale) * density_brick_int[i];
565 }
566
fieldforce_kernel(int elements_per_thread,int read_threads_at_same_time,int * flag)567 __global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_same_time, int* flag) //20*x64 0.36
568 {
569 int i;
570
571 // loop over my charges, interpolate electric field from nearby grid points
572 // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
573 // (dx,dy,dz) = distance to "lower left" grid pt
574 // (mx,my,mz) = global coords of moving stencil pt
575 // ek = 3 components of E-field on particle
576 i = blockIdx.x * blockDim.x + threadIdx.x;
577 int* idx = (int*) sharedmem;
578 PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x];
579 PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];
580
581 if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
582 srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];
583
584 __syncthreads();
585 {
586 int l, m, n, nx, ny, nz, my, mz;
587 PPPM_FLOAT dx, dy, dz, x0, y0, z0;
588 PPPM_FLOAT ek[3];
589
590 if(i < nlocal) {
591 nx = part2grid[i];
592 ny = part2grid[i + nmax];
593 nz = part2grid[i + 2 * nmax];
594 dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
595 dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
596 dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
597
598 ek[0] = ek[1] = ek[2] = PPPM_F(0.0);
599 } else {
600 nx = ny = nz = 1;
601 dx = dy = dz = PPPM_F(0.1);
602 }
603
604 __syncthreads();
605
606 for(n = nlower; n <= nupper; n++) {
607 mz = n + nz;
608 z0 = rho1d(n, dz, srho_coeff);
609
610 for(m = nlower; m <= nupper; m++) {
611 my = m + ny;
612 y0 = z0 * rho1d(m, dy, srho_coeff);
613
614
615 if(i < nlocal)
616 idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
617 else idx[threadIdx.x] = -1;
618
619 __syncthreads();
620
621 for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
622 int kk = threadIdx.x / elements_per_thread;
623
624 if((threadIdx.x < elements_per_thread * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
625 tmp_brick[ii * elements_per_thread + threadIdx.x] = vdx_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
626 tmp_brick[(ii + blockDim.x)*elements_per_thread + threadIdx.x] = vdy_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
627 tmp_brick[(ii + 2 * blockDim.x)*elements_per_thread + threadIdx.x] = vdz_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
628 }
629 }
630
631 __syncthreads();
632
633 if(i < nlocal)
634 for(l = nlower; l <= nupper; l++) {
635 x0 = y0 * rho1d(l, dx, srho_coeff);
636
637 ek[0] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower];
638 ek[1] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + blockDim.x * elements_per_thread];
639 ek[2] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + 2 * blockDim.x * elements_per_thread];
640 }
641
642 __syncthreads();
643 }
644 }
645
646 // convert E-field to force
647
648
649 _f[i] += qqrd2e * _q[i] * ek[0];
650 _f[i + nmax] += qqrd2e * _q[i] * ek[1];
651 _f[i + 2 * nmax] += qqrd2e * _q[i] * ek[2];
652 }
653 }
654
slabcorr_energy_kernel(ENERGY_FLOAT * buf)655 __global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
656 {
657 ENERGY_FLOAT* dipole = (ENERGY_FLOAT*) sharedmem;
658 int i = blockIdx.x * blockDim.x + threadIdx.x;
659
660 if(i < nlocal)
661 dipole[threadIdx.x] = _q[i] * _x[i + 2 * nmax];
662 else
663 dipole[threadIdx.x] = ENERGY_F(0.0);
664
665 __syncthreads();
666 reduceBlock(dipole);
667
668 if(threadIdx.x == 0) buf[blockIdx.x] = dipole[0];
669 }
670
slabcorr_force_kernel(F_FLOAT ffact)671 __global__ void slabcorr_force_kernel(F_FLOAT ffact)
672 {
673 int i = blockIdx.x * blockDim.x + threadIdx.x;
674
675 if(i < nlocal)
676 _f[i + 2 * nmax] += qqrd2e * _q[i] * ffact;
677 }
678
679
initfftdata_core_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)680 __global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
681 {
682 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] = in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
683 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x) + 1] = 0;
684 }
685
initfftdata_z_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)686 __global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
687 {
688 if(slabflag) {
689 if(blockIdx.x < nzlo_in - nzlo_out)
690 out[2 * (((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
691 } else {
692 if(blockIdx.x < nzlo_in - nzlo_out)
693 out[2 * (((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
694 }
695
696 if(blockIdx.x < nzhi_out - nzhi_in)
697 out[2 * ((((blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + (nzhi_out - nzlo_in)) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
698 }
699
initfftdata_y_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)700 __global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
701 {
702 if(blockIdx.y < nylo_in - nylo_out)
703 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + (2 * (nyhi_in + 1) - nylo_in - nyhi_out) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
704
705 if(blockIdx.y < nyhi_out - nyhi_in)
706 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + (nyhi_out - nylo_in)) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
707 }
708
initfftdata_x_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)709 __global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
710 {
711 if(threadIdx.x < nxlo_in - nxlo_out)
712 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
713
714 if(threadIdx.x < nxhi_out - nxhi_in)
715 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
716 }
717
initfftdata_yz_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)718 __global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
719 {
720 if(slabflag) {
721 if(blockIdx.x < nzlo_in - nzlo_out)
722 if(blockIdx.y < nyhi_out - nyhi_in)
723 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
724
725 if(blockIdx.x < nzlo_in - nzlo_out)
726 if(blockIdx.y < nylo_in - nylo_out)
727 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
728 } else {
729 if(blockIdx.x < nzlo_in - nzlo_out)
730 if(blockIdx.y < nyhi_out - nyhi_in)
731 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
732
733 if(blockIdx.x < nzlo_in - nzlo_out)
734 if(blockIdx.y < nylo_in - nylo_out)
735 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
736 }
737
738 if(blockIdx.x < nzhi_out - nzhi_in)
739 if(blockIdx.y < nyhi_out - nyhi_in)
740 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
741
742 if(blockIdx.x < nzhi_out - nzhi_in)
743 if(blockIdx.y < nylo_in - nylo_out)
744 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
745 }
746
initfftdata_xz_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)747 __global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
748 {
749 if(blockIdx.x < nzhi_out - nzhi_in)
750 if(threadIdx.x < nxlo_in - nxlo_out)
751 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
752
753 if(blockIdx.x < nzhi_out - nzhi_in)
754 if(threadIdx.x < nxhi_out - nxhi_in)
755 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
756
757 if(slabflag) {
758 if(blockIdx.x < nzlo_in - nzlo_out)
759 if(threadIdx.x < nxlo_in - nxlo_out)
760 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
761
762 if(blockIdx.x < nzlo_in - nzlo_out)
763 if(threadIdx.x < nxhi_out - nxhi_in)
764 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
765 } else {
766 if(blockIdx.x < nzlo_in - nzlo_out)
767 if(threadIdx.x < nxlo_in - nxlo_out)
768 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
769
770 if(blockIdx.x < nzlo_in - nzlo_out)
771 if(threadIdx.x < nxhi_out - nxhi_in)
772 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
773 }
774 }
775
initfftdata_xy_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)776 __global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
777 {
778 if(blockIdx.y < nyhi_out - nyhi_in)
779 if(threadIdx.x < nxlo_in - nxlo_out)
780 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
781
782 if(blockIdx.y < nyhi_out - nyhi_in)
783 if(threadIdx.x < nxhi_out - nxhi_in)
784 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
785
786 if(blockIdx.y < nylo_in - nylo_out)
787 if(threadIdx.x < nxlo_in - nxlo_out)
788 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
789
790 if(blockIdx.y < nylo_in - nylo_out)
791 if(threadIdx.x < nxhi_out - nxhi_in)
792 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
793 }
794
initfftdata_xyz_kernel(PPPM_FLOAT * in,FFT_FLOAT * out)795 __global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
796 {
797 if(blockIdx.x < nzhi_out - nzhi_in)
798 if(blockIdx.y < nyhi_out - nyhi_in)
799 if(threadIdx.x < nxlo_in - nxlo_out)
800 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
801
802 if(blockIdx.x < nzhi_out - nzhi_in)
803 if(blockIdx.y < nyhi_out - nyhi_in)
804 if(threadIdx.x < nxhi_out - nxhi_in)
805 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
806
807 if(blockIdx.x < nzhi_out - nzhi_in)
808 if(blockIdx.y < nylo_in - nylo_out)
809 if(threadIdx.x < nxlo_in - nxlo_out)
810 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
811
812 if(blockIdx.x < nzhi_out - nzhi_in)
813 if(blockIdx.y < nylo_in - nylo_out)
814 if(threadIdx.x < nxhi_out - nxhi_in)
815 out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
816
817 if(slabflag) {
818 if(blockIdx.x < nzlo_in - nzlo_out)
819 if(blockIdx.y < nyhi_out - nyhi_in)
820 if(threadIdx.x < nxlo_in - nxlo_out)
821 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
822
823 if(blockIdx.x < nzlo_in - nzlo_out)
824 if(blockIdx.y < nyhi_out - nyhi_in)
825 if(threadIdx.x < nxhi_out - nxhi_in)
826 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
827
828 if(blockIdx.x < nzlo_in - nzlo_out)
829 if(blockIdx.y < nylo_in - nylo_out)
830 if(threadIdx.x < nxlo_in - nxlo_out)
831 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
832
833 if(blockIdx.x < nzlo_in - nzlo_out)
834 if(blockIdx.y < nylo_in - nylo_out)
835 if(threadIdx.x < nxhi_out - nxhi_in)
836 out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
837 } else {
838 if(blockIdx.x < nzlo_in - nzlo_out)
839 if(blockIdx.y < nyhi_out - nyhi_in)
840 if(threadIdx.x < nxlo_in - nxlo_out)
841 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
842
843 if(blockIdx.x < nzlo_in - nzlo_out)
844 if(blockIdx.y < nyhi_out - nyhi_in)
845 if(threadIdx.x < nxhi_out - nxhi_in)
846 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
847
848 if(blockIdx.x < nzlo_in - nzlo_out)
849 if(blockIdx.y < nylo_in - nylo_out)
850 if(threadIdx.x < nxlo_in - nxlo_out)
851 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];
852
853 if(blockIdx.x < nzlo_in - nzlo_out)
854 if(blockIdx.y < nylo_in - nylo_out)
855 if(threadIdx.x < nxhi_out - nxhi_in)
856 out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
857 }
858 }
859