/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */

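// OFFSET is kept only for the commented-out grid-coordinate shift in
// particle_map_kernel() below. negativCUDA(f) extracts the sign bit of f,
// returning 1 if f is negative and 0 otherwise; it is used further down to
// wrap grid indices back into the periodic box.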
#define OFFSET 4096
__device__ int negativCUDA(float f)
{
  return ((unsigned int)1 << 31 & (__float_as_int(f))) >> 31;
}

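// reduceBlock() sums the blockDim.x values stored in the shared-memory array
// 'data' into data[0]; the initial fold handles non-power-of-two block sizes.
// The float and double overloads below are otherwise identical.
// Assumed usage from a kernel (sketch):
//   s_data[threadIdx.x] = my_value;
//   __syncthreads();
//   reduceBlock(s_data);   // block-wide sum is now in s_data[0]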
__device__ void reduceBlock(float* data)
{
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

__device__ void reduceBlock(double* data)
{
  int p2 = 1;

  while(p2 * 2 < blockDim.x) p2 *= 2;

  if(threadIdx.x < blockDim.x - p2)
    data[threadIdx.x] += data[threadIdx.x + p2];

  __syncthreads();

  for(int i = 2; i <= p2; i *= 2) {
    if(threadIdx.x < p2 / i)
      data[threadIdx.x] += data[threadIdx.x + p2 / i];

    __syncthreads();
  }
}

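// Dynamically sized shared-memory buffer used by several kernels below.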
extern __shared__ PPPM_FLOAT sharedmem[];

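// Precomputes the Fourier-space wave vectors fkx/fky/fkz (folded to negative
// frequencies above N/2) and, for grid points inside the local FFT sub-domain,
// the six per-point virial coefficients vg[6*n+0..5].
// Grid mapping: threadIdx.x -> x, blockIdx.y -> y, blockIdx.x -> z.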
__global__ void setup_fkxyz_vg(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald)
{
  PPPM_FLOAT my_fkx = unitkx * (int(threadIdx.x) - nx_pppm * (2 * int(threadIdx.x) / nx_pppm));
  PPPM_FLOAT my_fky = unitky * (int(blockIdx.y) - ny_pppm * (2 * int(blockIdx.y) / ny_pppm));
  PPPM_FLOAT my_fkz = unitkz * (int(blockIdx.x) - nz_pppm * (2 * int(blockIdx.x) / nz_pppm));

  if((blockIdx.x == 0) && (blockIdx.y == 0)) fkx[threadIdx.x] = my_fkx;

  if((blockIdx.x == 0) && (threadIdx.x == 0)) fky[blockIdx.y] = my_fky;

  if((threadIdx.x == 0) && (blockIdx.y == 0)) fkz[blockIdx.x] = my_fkz;

  __syncthreads();

  if((blockIdx.x >= nzlo_fft) && (blockIdx.x <= nzhi_fft) &&
      (blockIdx.y >= nylo_fft) && (blockIdx.y <= nyhi_fft) &&
      (threadIdx.x >= nxlo_fft) && (threadIdx.x <= nxhi_fft)) {
    int n = ((int(blockIdx.x) - nzlo_fft) * (nyhi_fft - nylo_fft + 1) + int(blockIdx.y) - nylo_fft) * (nxhi_fft - nxlo_fft + 1) + int(threadIdx.x) - nxlo_fft;
    PPPM_FLOAT sqk = my_fkx * my_fkx + my_fky * my_fky + my_fkz * my_fkz;
    PPPM_FLOAT vterm = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(-2.0) * (PPPM_F(1.0) / sqk + PPPM_F(0.25) / (g_ewald * g_ewald));
    vg[6 * n + 0] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkx * my_fkx;
    vg[6 * n + 1] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fky * my_fky;
    vg[6 * n + 2] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : PPPM_F(1.0) + vterm * my_fkz * my_fkz;
    vg[6 * n + 3] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fky;
    vg[6 * n + 4] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fkx * my_fkz;
    vg[6 * n + 5] = (sqk == PPPM_F(0.0)) ? PPPM_F(0.0) : vterm * my_fky * my_fkz;

  }
}

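// Denominator of the optimized influence function: Horner evaluation of the
// gf_b polynomial in x, y and z (each the squared sine of half a grid wave
// vector component), returning (sx*sy*sz)^2.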
__device__ PPPM_FLOAT gf_denom(PPPM_FLOAT x, PPPM_FLOAT y, PPPM_FLOAT z)
{
  PPPM_FLOAT sx, sy, sz;
  sz = sy = sx = PPPM_F(0.0);

  for(int l = order - 1; l >= 0; l--) {
    sx = gf_b[l] + sx * x;
    sy = gf_b[l] + sy * y;
    sz = gf_b[l] + sz * z;
  }

  PPPM_FLOAT s = sx * sy * sz;
  return s * s;
}

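// Computes the optimized Green's function greensfn[n] for every k-space grid
// point, summing aliasing images within +-nbx, +-nby, +-nbz Brillouin zones;
// 12.5663706 is 4*pi. Points with sqk == 0 get greensfn = 0.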
__global__ void setup_greensfn(PPPM_FLOAT unitkx, PPPM_FLOAT unitky, PPPM_FLOAT unitkz, PPPM_FLOAT g_ewald,
                               int nbx, int nby, int nbz,
                               PPPM_FLOAT xprd, PPPM_FLOAT yprd, PPPM_FLOAT zprd_slab)
{
  PPPM_FLOAT sqk;
  int nx, ny, nz, kper, lper, mper, k, l, m;
  PPPM_FLOAT snx, sny, snz, snx2, sny2, snz2;
  PPPM_FLOAT argx, argy, argz, wx, wy, wz, sx, sy, sz, qx, qy, qz;
  PPPM_FLOAT sum1, dot1, dot2;
  PPPM_FLOAT numerator, denominator;

  PPPM_FLOAT form = PPPM_F(1.0);
  int n = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  m = blockIdx.x;
  l = blockIdx.y;
  k = threadIdx.x;

  mper = m - nz_pppm * (2 * m / nz_pppm);
  snz = sin(PPPM_F(0.5) * unitkz * mper * zprd_slab / nz_pppm);
  snz2 = snz * snz;


  lper = l - ny_pppm * (2 * l / ny_pppm);
  sny = sin(PPPM_F(0.5) * unitky * lper * yprd / ny_pppm);
  sny2 = sny * sny;

  kper = k - nx_pppm * (2 * k / nx_pppm);
  snx = sin(PPPM_F(0.5) * unitkx * kper * xprd / nx_pppm);
  snx2 = snx * snx;

  sqk = pow(unitkx * kper, PPPM_F(2.0)) + pow(unitky * lper, PPPM_F(2.0)) +
        pow(unitkz * mper, PPPM_F(2.0));

  if(sqk != PPPM_F(0.0)) {
    numerator = form * PPPM_F(12.5663706) / sqk;
    denominator = gf_denom(snx2, sny2, snz2);
    sum1 = PPPM_F(0.0);

    for(nx = -nbx; nx <= nbx; nx++) {
      qx = unitkx * (kper + nx_pppm * nx);
      sx = exp(PPPM_F(-.25) * pow(qx / g_ewald, PPPM_F(2.0)));
      wx = PPPM_F(1.0);
      argx = PPPM_F(0.5) * qx * xprd / nx_pppm;

      if(argx != PPPM_F(0.0)) wx = pow(sin(argx) / argx, order);

      for(ny = -nby; ny <= nby; ny++) {
        qy = unitky * (lper + ny_pppm * ny);
        sy = exp(PPPM_F(-.25) * pow(qy / g_ewald, PPPM_F(2.0)));
        wy = PPPM_F(1.0);
        argy = PPPM_F(0.5) * qy * yprd / ny_pppm;

        if(argy != PPPM_F(0.0)) wy = pow(sin(argy) / argy, order);

        for(nz = -nbz; nz <= nbz; nz++) {
          qz = unitkz * (mper + nz_pppm * nz);
          sz = exp(PPPM_F(-.25) * pow(qz / g_ewald, PPPM_F(2.0)));
          wz = PPPM_F(1.0);
          argz = PPPM_F(0.5) * qz * zprd_slab / nz_pppm;

          if(argz != PPPM_F(0.0)) wz = pow(sin(argz) / argz, order);

          dot1 = unitkx * kper * qx + unitky * lper * qy + unitkz * mper * qz;
          dot2 = qx * qx + qy * qy + qz * qz;
          sum1 += (dot1 / dot2) * sx * sy * sz * pow(wx * wy * wz, PPPM_F(2.0));
        }
      }
    }

    greensfn[n] = numerator * sum1 / denominator;
  } else greensfn[n] = PPPM_F(0.0);
}

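// k-space Poisson solve: poisson_scale_kernel multiplies the transformed
// charge density (work1) by the Green's function and the 1/(nx*ny*nz) FFT
// normalization; the three gradient kernels then multiply by -i*fk to obtain
// the corresponding field component in work2.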
__global__ void poisson_scale_kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  FFT_FLOAT scaleinv = FFT_F(1.0) / (gridDim.x * gridDim.y * blockDim.x);
  work1[2 * i] *= scaleinv * greensfn[i];
  work1[2 * i + 1] *= scaleinv * greensfn[i];
}

__global__ void poisson_xgrad_kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  work2[2 * i] = fkx[threadIdx.x] * work1[2 * i + 1];
  work2[2 * i + 1] = -fkx[threadIdx.x] * work1[2 * i];
}

__global__ void poisson_ygrad_kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  work2[2 * i] = fky[blockIdx.y] * work1[2 * i + 1];
  work2[2 * i + 1] = -fky[blockIdx.y] * work1[2 * i];
}

__global__ void poisson_zgrad_kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  work2[2 * i] = fkz[blockIdx.x] * work1[2 * i + 1];
  work2[2 * i + 1] = -fkz[blockIdx.x] * work1[2 * i];
}

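// Copy the inverse-FFT'd field components from work3 back into the
// ghost-padded vdx/vdy/vdz bricks; negativCUDA() wraps indices that fall
// outside [0, n*_pppm) back into the periodic grid.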
__global__ void poisson_vdx_brick_kernel(int ilo, int jlo, int klo)
{
  int k = blockIdx.x + klo;
  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
  int j = blockIdx.y + jlo;
  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
  int i = threadIdx.x + ilo;
  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
  vdx_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
}

__global__ void poisson_vdy_brick_kernel(int ilo, int jlo, int klo)
{
  int k = blockIdx.x + klo;
  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
  int j = blockIdx.y + jlo;
  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
  int i = threadIdx.x + ilo;
  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
  vdy_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
}

__global__ void poisson_vdz_brick_kernel(int ilo, int jlo, int klo)
{
  int k = blockIdx.x + klo;
  k += nz_pppm * negativCUDA(CUDA_F(1.0) * k) - nz_pppm * negativCUDA(CUDA_F(1.0) * (nz_pppm - k - 1));
  int j = blockIdx.y + jlo;
  j += ny_pppm * negativCUDA(CUDA_F(1.0) * j) - ny_pppm * negativCUDA(CUDA_F(1.0) * (ny_pppm - j - 1));
  int i = threadIdx.x + ilo;
  i += nx_pppm * negativCUDA(CUDA_F(1.0) * i) - nx_pppm * negativCUDA(CUDA_F(1.0) * (nx_pppm - i - 1));
  vdz_brick[((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1) + threadIdx.x] = work3[2 * (((k) * ny_pppm + (j)) * nx_pppm + i)];
}

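// Per-block reduction of the k-space energy density greensfn*|work1|^2 (and,
// if vflag is set, of the six virial components weighted by vg) into one
// partial sum per (z,y) grid column.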
__global__ void poisson_energy_kernel(int nxlo_fft, int nylo_fft, int nzlo_fft, int vflag)
{
  ENERGY_FLOAT scaleinv = FFT_F(1.0) / (nx_pppm * ny_pppm * nz_pppm);
  int i = (blockIdx.x + nzlo_fft) * ny_pppm * nx_pppm + (blockIdx.y + nylo_fft) * nx_pppm + threadIdx.x + nxlo_fft;
  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
  ENERGY_FLOAT myenergy = scaleinv * scaleinv * greensfn[i] * (work1[2 * i] * work1[2 * i] + work1[2 * i + 1] * work1[2 * i + 1]);
  s_energy[threadIdx.x] = myenergy;

  __syncthreads();
  reduceBlock(s_energy);

  if(threadIdx.x == 0)
    energy[blockIdx.x * ny_pppm + blockIdx.y] = s_energy[0];

  if(vflag) {
    __syncthreads();

    for(int j = 0; j < 6; j++) {
      s_energy[threadIdx.x] = myenergy * vg[((blockIdx.x * gridDim.y + blockIdx.y) * (blockDim.x) + threadIdx.x) * 6 + j];
      __syncthreads();
      reduceBlock(s_energy);

      if(threadIdx.x == 0)
        virial[blockIdx.x * ny_pppm + blockIdx.y + j * nz_pppm * ny_pppm] = s_energy[0];
    }
  }
}


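// Two-stage reduction of the partial sums produced by poisson_energy_kernel:
// kernel1 collapses each z-slab over y, kernel2 then collapses over z so that
// the totals end up in energy[0] and virial[0..5].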
__global__ void sum_energy_kernel1(int vflag)
{
  ENERGY_FLOAT myenergy = energy[(blockIdx.x * ny_pppm + threadIdx.x)];
  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
  s_energy[threadIdx.x] = myenergy;
  __syncthreads();
  reduceBlock(s_energy);

  if(threadIdx.x == 0)
    energy[blockIdx.x * ny_pppm] = s_energy[0];

  if(vflag) {
    __syncthreads();

    for(int j = 0; j < 6; j++) {
      myenergy = virial[blockIdx.x * ny_pppm + threadIdx.x + j * ny_pppm * nz_pppm];
      s_energy[threadIdx.x] = myenergy;
      __syncthreads();
      reduceBlock(s_energy);

      if(threadIdx.x == 0)
        virial[blockIdx.x * ny_pppm + j * ny_pppm * nz_pppm] = s_energy[0];
    }
  }

}

__global__ void sum_energy_kernel2(int vflag)
{
  ENERGY_FLOAT myenergy = energy[threadIdx.x * ny_pppm];
  ENERGY_FLOAT* s_energy = (ENERGY_FLOAT*) sharedmem;
  s_energy[threadIdx.x] = myenergy;
  __syncthreads();
  reduceBlock(s_energy);

  if(threadIdx.x == 0)
    energy[0] = s_energy[0];

  if(vflag) {
    __syncthreads();

    for(int j = 0; j < 6; j++) {
      myenergy = virial[threadIdx.x * ny_pppm + j * ny_pppm * nz_pppm];
      s_energy[threadIdx.x] = myenergy;
      __syncthreads();
      reduceBlock(s_energy);

      if(threadIdx.x == 0)
        virial[j] = s_energy[0];
    }
  }
}

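// 1d charge-assignment weight for stencil point k at fractional offset d,
// evaluated by Horner's rule from the rho_coeff (or srho_coeff) table.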
__device__ PPPM_FLOAT rho1d(int k, PPPM_FLOAT d, PPPM_FLOAT* srho_coeff)
{
  PPPM_FLOAT rho1d_tmp = PPPM_F(0.0);

  for(int l = order - 1; l >= 0; l--)
    rho1d_tmp = srho_coeff[l * order + k - (1 - order) / 2] + rho1d_tmp * d;

  return rho1d_tmp;
}

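// Maps every local atom to the grid point "lower left" of its position and
// stores the result in part2grid. Atoms whose full stencil would not fit in
// the local ghost-padded brick increment flag[0] and dump debug info.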
__global__ void particle_map_kernel(int* flag)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < nlocal) {
    int nx, ny, nz;
    PPPM_FLOAT shift = PPPM_F(0.5) - shiftone; //+OFFSET;
    nx = (int)((_x[i] - _boxlo[0]) * delxinv + shift); // - OFFSET;
    ny = (int)((_x[i + nmax] - _boxlo[1]) * delyinv + shift); // - OFFSET;
    nz = (int)((_x[i + 2 * nmax] - _boxlo[2]) * delzinv + shift); // - OFFSET;

    part2grid[i] = nx;
    part2grid[i + nmax] = ny;
    part2grid[i + 2 * nmax] = nz;

    // check that entire stencil around nx,ny,nz will fit in my 3d brick
    if(nx + nlower < nxlo_out || nx + nupper > nxhi_out ||
        ny + nlower < nylo_out || ny + nupper > nyhi_out ||
        nz + nlower < nzlo_out || nz + nupper > nzhi_out) {
      flag[0]++;
      debugdata[0] = i;
      debugdata[1] = _boxlo[0];
      debugdata[2] = _boxlo[1];
      debugdata[3] = _boxlo[2];
      debugdata[4] = nx;
      debugdata[5] = ny;
      debugdata[6] = nz;
      debugdata[7] = _x[i];
      debugdata[8] = _x[i + _nmax];
      debugdata[9] = _x[i + 2 * _nmax];
      debugdata[10] = nlocal;

    }
  }
}

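// Simple charge-spreading variant: density_brick_int is used as a per-cell
// spin lock around a floating-point accumulation into density_brick. Note
// that the __syncthreads() inside the i < nlocal branch assumes all threads
// of a block take the same path.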
__global__ void make_rho_kernelA()
{
  int i, l, m, n, nx, ny, nz, mx, my, mz;

  // clear 3d density array


  // loop over my charges, add their contribution to nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt

  i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < nlocal) {

    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
    nx = part2grid[i];
    ny = part2grid[i + nmax];
    nz = part2grid[i + 2 * nmax];
    dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
    dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
    dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;

    z0 = delxinv * delyinv * delzinv * _q[i];

    for(n = nlower; n <= nupper; n++) {
      mz = n + nz;
      y0 = z0 * rho1d(n, dz, rho_coeff);

      for(m = nlower; m <= nupper; m++) {
        my = m + ny;
        x0 = y0 * rho1d(m, dy, rho_coeff);

        for(l = nlower; l <= nupper; l++) {
          mx = l + nx;
          int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;

          while(atomicAdd(&density_brick_int[mzyx], 1) != 0) atomicAdd(&density_brick_int[mzyx], -1);

          density_brick[mzyx] += x0 * rho1d(l, dx, rho_coeff);
          __threadfence();
          atomicAdd(&density_brick_int[mzyx], -1);
          __syncthreads();

        }
      }
    }
  }
}

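// Default charge-spreading kernel: stencil weights are converted to
// fixed-point integers (scaled by density_intScale) and accumulated with
// atomicAdd into density_brick_int; the high bits of the returned value are
// checked to flag overflow (flag[0], flag[1]). Shared memory stages the
// per-atom stencil rows so that read_threads_at_same_time atoms are written
// per pass. The if(false) block is a disabled per-atom reference path.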
__global__ void make_rho_kernel(int* flag, int read_threads_at_same_time)
{
  int i, l, m, n, nx, ny, nz, mx, my, mz, a, b;

  // clear 3d density array


  // loop over my charges, add their contribution to nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // int nzxy=blockIdx.x*gridDim.y+blockIdx.y;

  int nelements = nupper - nlower + 1;
  int* idx = (int*) sharedmem;
  int* sdensity_brick_int = &idx[blockDim.x];
  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &sdensity_brick_int[nelements * blockDim.x];

  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];

  __syncthreads();

  i = blockIdx.x * blockDim.x + threadIdx.x;

  if(false) {
    if(i < nlocal) {

      PPPM_FLOAT dx, dy, dz, x0, y0, z0;
      nx = part2grid[i];
      ny = part2grid[i + nmax];
      nz = part2grid[i + 2 * nmax];
      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;

      z0 = delxinv * delyinv * delzinv * _q[i];

      for(n = nlower; n <= nupper; n++) {
        mz = n + nz;
        y0 = z0 * rho1d(n, dz, srho_coeff);

        for(m = nlower; m <= nupper; m++) {
          my = m + ny;
          x0 = y0 * rho1d(m, dy, srho_coeff);

          for(l = nlower; l <= nupper; l++) {
            mx = l + nx;
            int mzyx = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + mx - nxlo_out;

            a = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
            b = (atomicAdd(&density_brick_int[mzyx], a) | a);

            if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
              flag[1]++;

              if((b) & (0x60000000)) flag[0]++;
            }

            __syncthreads();
          }
        }
      }
    }

    return;
  }

  i = blockIdx.x * blockDim.x + threadIdx.x;
  {

    PPPM_FLOAT dx, dy, dz, x0, y0, z0, qtmp;

    if(i < nlocal) {
      qtmp = _q[i];
      nx = part2grid[i];
      ny = part2grid[i + nmax];
      nz = part2grid[i + 2 * nmax];
      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;
      z0 = delxinv * delyinv * delzinv * qtmp;
    } else {
      nx = ny = nz = 1;
      dx = dy = dz = PPPM_F(0.1);
    }

    __syncthreads();

    for(n = nlower; n <= nupper; n++) {
      mz = n + nz;
      y0 = z0 * rho1d(n, dz, srho_coeff);

      for(m = nlower; m <= nupper; m++) {
        my = m + ny;
        x0 = y0 * rho1d(m, dy, srho_coeff);

        if(i < nlocal) {
          idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;

          for(l = nlower; l <= nupper; l++) {
            sdensity_brick_int[threadIdx.x * nelements + l - nlower] = int(x0 * rho1d(l, dx, srho_coeff) * density_intScale);
          }
        } else idx[threadIdx.x] = -1;

        __syncthreads();

        for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
          int kk = threadIdx.x / nelements;

          if((threadIdx.x < nelements * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
            a = sdensity_brick_int[ii * nelements + threadIdx.x];
            //if(a*a>1e-100)
            b = (atomicAdd(&density_brick_int[idx[ii + kk] + threadIdx.x - kk * nelements], a) | a);

            //else
            //b=(density_brick_int[idx[ii+kk]+threadIdx.x-kk*nelements]|a);
            if(((b) & (0x7c000000)) && (not((b) & (0x80000000)))) {
              flag[1]++;

              if((b) & (0x60000000)) flag[0]++;
            }
          }
        }

        __syncthreads();     //*/
      }
    }

  }
}

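// Converts the fixed-point density back to floating point.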
__global__ void scale_rho_kernel()
{
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
  density_brick[i] = (1.0 / density_intScale) * density_brick_int[i];
}

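// Interpolates the electric field from the vdx/vdy/vdz bricks back to the
// atoms: brick values for the current stencil row are staged through shared
// memory (tmp_brick), weighted by rho1d(), accumulated into ek[], and finally
// converted to forces via F += qqrd2e * q * E.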
__global__ void fieldforce_kernel(int elements_per_thread, int read_threads_at_same_time, int* flag) //20*x64 0.36
{
  int i;

  // loop over my charges, interpolate electric field from nearby grid points
  // (nx,ny,nz) = global coords of grid pt to "lower left" of charge
  // (dx,dy,dz) = distance to "lower left" grid pt
  // (mx,my,mz) = global coords of moving stencil pt
  // ek = 3 components of E-field on particle
  i = blockIdx.x * blockDim.x + threadIdx.x;
  int* idx = (int*) sharedmem;
  PPPM_FLOAT* tmp_brick = (PPPM_FLOAT*) &idx[blockDim.x];
  PPPM_FLOAT* srho_coeff = (PPPM_FLOAT*) &tmp_brick[3 * blockDim.x * elements_per_thread];

  if(threadIdx.x < order * (order / 2 - (1 - order) / 2 + 1))
    srho_coeff[threadIdx.x] = rho_coeff[threadIdx.x];

  __syncthreads();
  {
    int l, m, n, nx, ny, nz, my, mz;
    PPPM_FLOAT dx, dy, dz, x0, y0, z0;
    PPPM_FLOAT ek[3];

    if(i < nlocal) {
      nx = part2grid[i];
      ny = part2grid[i + nmax];
      nz = part2grid[i + 2 * nmax];
      dx = nx + shiftone - (_x[i] - _boxlo[0]) * delxinv;
      dy = ny + shiftone - (_x[i + nmax] - _boxlo[1]) * delyinv;
      dz = nz + shiftone - (_x[i + 2 * nmax] - _boxlo[2]) * delzinv;

      ek[0] = ek[1] = ek[2] = PPPM_F(0.0);
    } else {
      nx = ny = nz = 1;
      dx = dy = dz = PPPM_F(0.1);
    }

    __syncthreads();

    for(n = nlower; n <= nupper; n++) {
      mz = n + nz;
      z0 = rho1d(n, dz, srho_coeff);

      for(m = nlower; m <= nupper; m++) {
        my = m + ny;
        y0 = z0 * rho1d(m, dy, srho_coeff);


        if(i < nlocal)
          idx[threadIdx.x] = ((mz - nzlo_out) * (nyhi_out - nylo_out + 1) + my - nylo_out) * (nxhi_out - nxlo_out + 1) + nx + nlower - nxlo_out;
        else idx[threadIdx.x] = -1;

        __syncthreads();

        for(int ii = 0; ii < blockDim.x; ii += read_threads_at_same_time) {
          int kk = threadIdx.x / elements_per_thread;

          if((threadIdx.x < elements_per_thread * read_threads_at_same_time) && (kk + ii < blockDim.x) && (idx[ii + kk] > -1)) {
            tmp_brick[ii * elements_per_thread + threadIdx.x] = vdx_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
            tmp_brick[(ii + blockDim.x)*elements_per_thread + threadIdx.x] = vdy_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
            tmp_brick[(ii + 2 * blockDim.x)*elements_per_thread + threadIdx.x] = vdz_brick[idx[ii + kk] + threadIdx.x - kk * elements_per_thread];
          }
        }

        __syncthreads();

        if(i < nlocal)
          for(l = nlower; l <= nupper; l++) {
            x0 = y0 * rho1d(l, dx, srho_coeff);

            ek[0] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower];
            ek[1] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + blockDim.x * elements_per_thread];
            ek[2] -= x0 * tmp_brick[threadIdx.x * elements_per_thread + l - nlower + 2 * blockDim.x * elements_per_thread];
          }

        __syncthreads();
      }
    }

    // convert E-field to force
    // guard the writeback: ek[] is only accumulated for local atoms (i < nlocal)

    if(i < nlocal) {
      _f[i] += qqrd2e * _q[i] * ek[0];
      _f[i + nmax] += qqrd2e * _q[i] * ek[1];
      _f[i + 2 * nmax] += qqrd2e * _q[i] * ek[2];
    }
  }
}

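// Slab correction (2d-periodic systems): the energy kernel block-reduces the
// per-atom dipole contributions q_i * z_i into buf; the force kernel adds the
// corresponding constant force ffact to the z component.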
__global__ void slabcorr_energy_kernel(ENERGY_FLOAT* buf)
{
  ENERGY_FLOAT* dipole = (ENERGY_FLOAT*) sharedmem;
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < nlocal)
    dipole[threadIdx.x] = _q[i] * _x[i + 2 * nmax];
  else
    dipole[threadIdx.x] = ENERGY_F(0.0);

  __syncthreads();
  reduceBlock(dipole);

  if(threadIdx.x == 0) buf[blockIdx.x] = dipole[0];
}

__global__ void slabcorr_force_kernel(F_FLOAT ffact)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;

  if(i < nlocal)
    _f[i + 2 * nmax] += qqrd2e * _q[i] * ffact;
}


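// The initfftdata_* kernels pack the ghost-padded density brick into the
// dense complex FFT array: the core kernel copies the inner region and zeroes
// the imaginary parts, while the z/y/x and mixed-direction kernels fold the
// ghost layers back onto the periodic interior (the slabflag branches shift
// the z wrap-around for slab geometries).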
__global__ void initfftdata_core_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] = in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
  out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x) + 1] = 0;
}

__global__ void initfftdata_z_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(slabflag) {
    if(blockIdx.x < nzlo_in - nzlo_out)
      out[2 * (((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
  } else {
    if(blockIdx.x < nzlo_in - nzlo_out)
      out[2 * (((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
  }

  if(blockIdx.x < nzhi_out - nzhi_in)
    out[2 * ((((blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + (nzhi_out - nzlo_in)) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
}

__global__ void initfftdata_y_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(blockIdx.y < nylo_in - nylo_out)
    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + (2 * (nyhi_in + 1) - nylo_in - nyhi_out) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];

  if(blockIdx.y < nyhi_out - nyhi_in)
    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + (nyhi_out - nylo_in)) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
}

__global__ void initfftdata_x_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(threadIdx.x < nxlo_in - nxlo_out)
    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(threadIdx.x < nxhi_out - nxhi_in)
    out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
}

__global__ void initfftdata_yz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(slabflag) {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
  } else {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
  }

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nyhi_out - nyhi_in)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nylo_in - nylo_out)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxlo_in - nxlo_out];
}

__global__ void initfftdata_xz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(blockIdx.x < nzhi_out - nzhi_in)
    if(threadIdx.x < nxlo_in - nxlo_out)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(threadIdx.x < nxhi_out - nxhi_in)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

  if(slabflag) {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(threadIdx.x < nxlo_in - nxlo_out)
        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(threadIdx.x < nxhi_out - nxhi_in)
        out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
  } else {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(threadIdx.x < nxlo_in - nxlo_out)
        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(threadIdx.x < nxhi_out - nxhi_in)
        out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nylo_in - nylo_out) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
  }
}

__global__ void initfftdata_xy_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(blockIdx.y < nyhi_out - nyhi_in)
    if(threadIdx.x < nxlo_in - nxlo_out)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(blockIdx.y < nyhi_out - nyhi_in)
    if(threadIdx.x < nxhi_out - nxhi_in)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

  if(blockIdx.y < nylo_in - nylo_out)
    if(threadIdx.x < nxlo_in - nxlo_out)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(blockIdx.y < nylo_in - nylo_out)
    if(threadIdx.x < nxhi_out - nxhi_in)
      out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzlo_in - nzlo_out) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
}

__global__ void initfftdata_xyz_kernel(PPPM_FLOAT* in, FFT_FLOAT* out)
{
  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nyhi_out - nyhi_in)
      if(threadIdx.x < nxlo_in - nxlo_out)
        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nyhi_out - nyhi_in)
      if(threadIdx.x < nxhi_out - nxhi_in)
        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nylo_in - nylo_out)
      if(threadIdx.x < nxlo_in - nxlo_out)
        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

  if(blockIdx.x < nzhi_out - nzhi_in)
    if(blockIdx.y < nylo_in - nylo_out)
      if(threadIdx.x < nxhi_out - nxhi_in)
        out[2 * (((blockIdx.x * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x + nzhi_in - nzlo_out + 1) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

  if(slabflag) {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        if(threadIdx.x < nxlo_in - nxlo_out)
          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        if(threadIdx.x < nxhi_out - nxhi_in)
          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        if(threadIdx.x < nxlo_in - nxlo_out)
          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        if(threadIdx.x < nxhi_out - nxhi_in)
          out[2 * ((((nzhi_in - nzlo_in + 2 - nupper - slabflag + blockIdx.x) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
  } else {
    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        if(threadIdx.x < nxlo_in - nxlo_out)
          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nyhi_out - nyhi_in)
        if(threadIdx.x < nxhi_out - nxhi_in)
          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y + nyhi_in - nylo_out + 1) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        if(threadIdx.x < nxlo_in - nxlo_out)
          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x + 2 * (nxhi_in + 1) - nxlo_in - nxhi_out)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x];

    if(blockIdx.x < nzlo_in - nzlo_out)
      if(blockIdx.y < nylo_in - nylo_out)
        if(threadIdx.x < nxhi_out - nxhi_in)
          out[2 * ((((blockIdx.x + 2 * (nzhi_in + 1) - nzlo_in - nzhi_out) * (nyhi_in - nylo_in + 1) + blockIdx.y + 2 * (nyhi_in + 1) - nylo_in - nyhi_out) * (nxhi_in - nxlo_in + 1)) + threadIdx.x)] += in[(((blockIdx.x) * (nyhi_out - nylo_out + 1) + blockIdx.y) * (nxhi_out - nxlo_out + 1)) + threadIdx.x + nxhi_in - nxlo_out + 1];
  }
}