1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
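/* The constants below implement the standard polynomial approximation of the
   complementary error function,
     erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),  t = 1/(1 + EWALD_P*x),
   which is evaluated inline in the COUL_LONG branches further down.
   EWALD_F is 2/sqrt(pi), the prefactor appearing in the derivative of erfc(x). */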
23 #define EWALD_F   1.12837917
24 #define EWALD_P   0.3275911
25 #define A1        0.254829592
26 #define A2       -0.284496736
27 #define A3        1.421413741
28 #define A4       -1.453152027
29 #define A5        1.061405429
30 
31 
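/* Pair_Kernel_TpA: "TpA" (one thread per atom). Each thread handles one atom i taken
   from _ilist and loops over that atom's neighbor list. Per-thread energy and virial
   contributions are staged in shared memory and handed to PairVirialCompute_A_Kernel
   at the end; the force on atom i is either added to _f directly or stored in _buffer
   when _collect_forces_later is set. */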
32 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
33 __global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
34 {
35   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
36   ENERGY_FLOAT ecoul = ENERGY_F(0.0);
37 
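  /* Shared-memory layout per block (in units of blockDim.x elements):
     [evdwl] [ecoul, only when a Coulomb type is compiled in] [6 virial components].
     sharedV is advanced past the energy slots below so that each thread owns one
     element per tallied quantity. */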
38   ENERGY_FLOAT* sharedE;
39   ENERGY_FLOAT* sharedECoul;
40   ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
41 
42   if(eflag || eflag_atom) {
43     sharedE = &sharedmem[threadIdx.x];
44     sharedE[0] = ENERGY_F(0.0);
45     sharedV += blockDim.x;
46 
47     if(coul_type != COUL_NONE) {
48       sharedECoul = sharedE + blockDim.x;
49       sharedECoul[0] = ENERGY_F(0.0);
50       sharedV += blockDim.x;
51     }
52   }
53 
54   if(vflag || vflag_atom) {
55     sharedV[0 * blockDim.x] = ENERGY_F(0.0);
56     sharedV[1 * blockDim.x] = ENERGY_F(0.0);
57     sharedV[2 * blockDim.x] = ENERGY_F(0.0);
58     sharedV[3 * blockDim.x] = ENERGY_F(0.0);
59     sharedV[4 * blockDim.x] = ENERGY_F(0.0);
60     sharedV[5 * blockDim.x] = ENERGY_F(0.0);
61   }
62 
63   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
64 
65   X_FLOAT xtmp, ytmp, ztmp;
66   X_FLOAT4 myxtype;
67   F_FLOAT fxtmp, fytmp, fztmp, fpair;
68   F_FLOAT delx, dely, delz;
69   F_FLOAT factor_lj, factor_coul;
70   F_FLOAT qtmp;
71   int itype, i, j;
72   int jnum = 0;
73   int* jlist;
74 
75   if(ii < _inum) {
76     i = _ilist[ii];
77 
78     myxtype = fetchXType(i);
79     xtmp = myxtype.x;
80     ytmp = myxtype.y;
81     ztmp = myxtype.z;
82     itype = static_cast <int>(myxtype.w);
83 
84 
85     fxtmp = F_F(0.0);
86     fytmp = F_F(0.0);
87     fztmp = F_F(0.0);
88 
89     if(coul_type != COUL_NONE)
90       qtmp = fetchQ(i);
91 
92     jnum = _numneigh[i];
93     jlist = &_neighbors[i];
94   }
95 
96   __syncthreads();
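  /* Neighbor loop. The high bits of each neighbor index encode the special-bond type;
     sbmask() extracts them to pick the 1-2/1-3/1-4 scaling factors and NEIGHMASK strips
     them before the index is used. Threads with ii >= _inum have jnum == 0 and skip the
     loop body. */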
97 
98   for(int jj = 0; jj < jnum; jj++) {
99     if(ii < _inum)
100       if(jj < jnum) {
101         fpair = F_F(0.0);
102         j = jlist[jj * _nlocal];
103         factor_lj =  _special_lj[sbmask(j)];
104 
105         if(coul_type != COUL_NONE)
106           factor_coul = _special_coul[sbmask(j)];
107 
108         j &= NEIGHMASK;
109 
110         myxtype = fetchXType(j);
111         delx = xtmp - myxtype.x;
112         dely = ytmp - myxtype.y;
113         delz = ztmp - myxtype.z;
114         int jtype = static_cast <int>(myxtype.w);
115 
116 
117         const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
118 
119         bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
120 
121         if(in_cutoff) {
122           switch(pair_type) {
123             case PAIR_BORN:
124               fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
125               break;
126 
127             case PAIR_BUCK:
128               fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
129               break;
130 
131             case PAIR_CG_CMM:
132               fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
133               break;
134 
135             case PAIR_LJ_CHARMM:
136               fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
137               break;
138 
139             case PAIR_LJ_CLASS2:
140               fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
141               break;
142 
143             case PAIR_LJ_CUT:
144               fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
145               break;
146 
147             case PAIR_LJ_EXPAND:
148               fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
149               break;
150 
151             case PAIR_LJ_GROMACS:
152               fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
153               break;
154 
155             case PAIR_LJ_SMOOTH:
156               fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
157               break;
158 
159             case PAIR_LJ96_CUT:
160               fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
161               break;
162 
163             case PAIR_MORSE_R6:
164               fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
165               break;
166 
167             case PAIR_MORSE:
168               fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
169               break;
170           }
171         }
172 
173         if(coul_type != COUL_NONE) {
174           const F_FLOAT qiqj = qtmp * fetchQ(j);
175 
176           if(qiqj * qiqj > 1e-8) {
177             const bool in_coul_cutoff =
178               rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
179 
180             if(in_coul_cutoff) {
181               switch(coul_type) {
182                 case COUL_CHARMM:
183                   fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
184                   break;
185 
186                 case COUL_CHARMM_IMPLICIT:
187                   fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
188                   break;
189 
190                 case COUL_CUT: {
191                   const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
192 
193                   if(eflag) {
194                     ecoul += forcecoul;
195                   }
196 
197                   fpair += forcecoul * (F_F(1.0) / rsq);
198                 }
199                 break;
200 
201                 case COUL_DEBYE: {
202                   const F_FLOAT r2inv = F_F(1.0) / rsq;
203                   const X_FLOAT r = _RSQRT_(r2inv);
204                   const X_FLOAT rinv = F_F(1.0) / r;
205                   const F_FLOAT screening = _EXP_(-_kappa * r);
206                   F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
207 
208                   if(eflag) {
209                     ecoul += forcecoul * rinv;
210                   }
211 
212                   forcecoul *= (_kappa + rinv);
213                   fpair += forcecoul * r2inv;
214                 }
215                 break;
216 
217                 case COUL_GROMACS:
218                   fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
219                   break;
220 
221                 case COUL_LONG: {
222                   const F_FLOAT r2inv = F_F(1.0) / rsq;
223                   const F_FLOAT r = _RSQRT_(r2inv);
224                   const F_FLOAT grij = _g_ewald * r;
225                   const F_FLOAT expm2 = _EXP_(-grij * grij);
226                   const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
227                   const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
228                   const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
229                   F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
230 
231                   if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
232 
233                   if(eflag) {
234                     ecoul += prefactor * erfc;
235 
236                     if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
237                   }
238 
239                   fpair += forcecoul * r2inv;
240                 }
241                 break;
242               }
243             }
244 
245             in_cutoff = in_cutoff || in_coul_cutoff;
246           }
247         }
248 
249 
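        /* Accumulate the pair force on atom i and, if requested, the per-pair virial
           terms in this thread's shared-memory slots. */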
250         if(in_cutoff) {
251           F_FLOAT dxfp, dyfp, dzfp;
252           fxtmp += dxfp = delx * fpair;
253           fytmp += dyfp = dely * fpair;
254           fztmp += dzfp = delz * fpair;
255 
256           if(vflag) {
257             sharedV[0 * blockDim.x] += delx * dxfp;
258             sharedV[1 * blockDim.x] += dely * dyfp;
259             sharedV[2 * blockDim.x] += delz * dzfp;
260             sharedV[3 * blockDim.x] += delx * dyfp;
261             sharedV[4 * blockDim.x] += delx * dzfp;
262             sharedV[5 * blockDim.x] += dely * dzfp;
263           }
264         }
265       }
266   }
267 
268   __syncthreads();
269 
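  /* Write back the force on atom i. With _collect_forces_later the result is stored in
     _buffer behind the per-block energy/virial slots (gridDim.x * gridDim.y entries per
     quantity) and collected into _f afterwards (see Pair_CollectForces_Kernel at the end
     of this file); otherwise it is added to _f directly. */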
270   if(ii < _inum) {
271     F_FLOAT* my_f;
272 
273     if(_collect_forces_later) {
274       ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
275 
276       if(eflag) {
277         buffer = &buffer[1 * gridDim.x * gridDim.y];
278 
279         if(coul_type != COUL_NONE)
280           buffer = &buffer[1 * gridDim.x * gridDim.y];
281       }
282 
283       if(vflag) {
284         buffer = &buffer[6 * gridDim.x * gridDim.y];
285       }
286 
287       my_f = (F_FLOAT*) buffer;
288       my_f += i;
289       *my_f = fxtmp;
290       my_f += _nmax;
291       *my_f = fytmp;
292       my_f += _nmax;
293       *my_f = fztmp;
294     } else {
295       my_f = _f + i;
296       *my_f += fxtmp;
297       my_f += _nmax;
298       *my_f += fytmp;
299       my_f += _nmax;
300       *my_f += fztmp;
301     }
302   }
303 
304   __syncthreads();
305 
306   if(eflag) {
307     sharedE[0] = evdwl;
308 
309     if(coul_type != COUL_NONE)
310       sharedECoul[0] = ecoul;
311   }
312 
313   if(eflag_atom && ii < _inum && i < _nlocal) {   // guard: i is only set for threads with ii < _inum
314     if(coul_type != COUL_NONE)
315       _eatom[i] += evdwl + ecoul;
316     else
317       _eatom[i] += evdwl;
318   }
319 
320   if(vflag_atom && ii < _inum && i < _nlocal) {
321     _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
322     _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
323     _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
324     _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
325     _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
326     _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
327   }
328 
329   if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
330 }
331 
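/* Pair_Kernel_BpA: "BpA" (one thread block per atom). One block handles one atom i; its
   threads stride over i's neighbor list, accumulate partial forces, energies and virials,
   and combine them with a shared-memory tree reduction. Thread 0 then writes the per-block
   results to _buffer and the total force on atom i. */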
332 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
333 __global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
334 {
335   int ii = (blockIdx.x * gridDim.y + blockIdx.y);
336 
337   if(ii >= _inum)
338     return;
339 
340   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
341   ENERGY_FLOAT ecoul = ENERGY_F(0.0);
342   F_FLOAT3* sharedVirial1;
343   F_FLOAT3* sharedVirial2;
344   F_FLOAT* sharedEnergy;
345   F_FLOAT* sharedEnergyCoul;
346 
347   F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
348 
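  /* Shared-memory layout: 64 F_FLOAT3 force slots, optionally 2 x 64 virial slots, then
     per-thread energy arrays. The fixed offsets of 64 imply blockDim.x <= 64 for these
     block-per-atom kernels. */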
349   if(vflag) {
350     sharedVirial1 = &sharedForce[64];
351     sharedVirial2 = &sharedVirial1[64];
352   } else {
353     sharedVirial1 = &sharedForce[0];
354     sharedVirial2 = &sharedVirial1[0];
355   }
356 
357   if(eflag) {
358     if(vflag || vflag_atom)
359       sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
360     else
361       sharedEnergy = (F_FLOAT*) &sharedForce[64];
362 
363     if(coul_type != COUL_NONE)
364       sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
365 
366   }
367 
368   F_FLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
369   F_FLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
370   F_FLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
371 
372   X_FLOAT xtmp, ytmp, ztmp;
373   X_FLOAT4 myxtype;
374   F_FLOAT delx, dely, delz;
375   F_FLOAT factor_lj, factor_coul;
376   F_FLOAT fpair;
377   F_FLOAT qtmp;
378   int itype, jnum, i, j;
379   int* jlist;
380 
381   i = _ilist[ii];
382 
383   myxtype = fetchXType(i);
384 
385   xtmp = myxtype.x;
386   ytmp = myxtype.y;
387   ztmp = myxtype.z;
388   itype = static_cast <int>(myxtype.w);
389 
390   if(coul_type != COUL_NONE)
391     qtmp = fetchQ(i);
392 
393   jnum = _numneigh[i];
394 
395   jlist = &_neighbors[i * _maxneighbors];
396   __syncthreads();
397 
398   for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
399     if(jj < jnum) {
400       fpair = F_F(0.0);
401       j = jlist[jj];
402       factor_lj =  _special_lj[sbmask(j)];
403 
404       if(coul_type != COUL_NONE)
405         factor_coul = _special_coul[sbmask(j)];
406 
407       j &= NEIGHMASK;
408 
409       myxtype = fetchXType(j);
410 
411       delx = xtmp - myxtype.x;
412       dely = ytmp - myxtype.y;
413       delz = ztmp - myxtype.z;
414       int jtype = static_cast <int>(myxtype.w);
415 
416       const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
417 
418       bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
419       bool in_coul_cutoff = false;   // initialized: only assigned below when charges are non-negligible
420 
421       if(in_cutoff) {
422         switch(pair_type) {
423           case PAIR_BORN:
424             fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
425             break;
426 
427           case PAIR_BUCK:
428             fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
429             break;
430 
431           case PAIR_CG_CMM:
432             fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
433             break;
434 
435           case PAIR_LJ_CHARMM:
436             fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
437             break;
438 
439           case PAIR_LJ_CLASS2:
440             fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
441             break;
442 
443           case PAIR_LJ_CUT:
444             fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
445             break;
446 
447           case PAIR_LJ_EXPAND:
448             fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
449             break;
450 
451           case PAIR_LJ_GROMACS:
452             fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
453             break;
454 
455           case PAIR_LJ_SMOOTH:
456             fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
457             break;
458 
459           case PAIR_LJ96_CUT:
460             fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
461             break;
462 
463           case PAIR_MORSE_R6:
464             fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
465             break;
466 
467           case PAIR_MORSE:
468             fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
469             break;
470         }
471       }
472 
473       if(coul_type != COUL_NONE) {
474         const F_FLOAT qiqj = qtmp * fetchQ(j);
475 
476         if(qiqj * qiqj > (1e-8f)) {
477           in_coul_cutoff =
478             rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
479 
480           if(in_coul_cutoff) {
481             switch(coul_type) {
482               case COUL_CHARMM:
483                 fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
484                 break;
485 
486               case COUL_CHARMM_IMPLICIT:
487                 fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
488                 break;
489 
490               case COUL_GROMACS:
491                 fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
492                 break;
493 
494               case COUL_LONG: {
495                 const F_FLOAT r2inv = F_F(1.0) / rsq;
496                 const F_FLOAT r = _RSQRT_(r2inv);
497                 const F_FLOAT grij = _g_ewald * r;
498                 const F_FLOAT expm2 = _EXP_(-grij * grij);
499                 const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
500                 const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
501                 const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
502                 F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
503 
504                 if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
505 
506                 if(eflag) {
507                   ecoul += prefactor * erfc;
508 
509                   if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
510                 }
511 
512                 fpair += forcecoul * r2inv;
513               }
514               break;
515 
516               case COUL_DEBYE: {
517                 const F_FLOAT r2inv = F_F(1.0) / rsq;
518                 const X_FLOAT r = _RSQRT_(r2inv);
519                 const X_FLOAT rinv = F_F(1.0) / r;
520                 const F_FLOAT screening = _EXP_(-_kappa * r);
521                 F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
522 
523                 if(eflag) {
524                   ecoul += forcecoul * rinv;
525                 }
526 
527                 forcecoul *= (_kappa + rinv);
528                 fpair += forcecoul * r2inv;
529               }
530               break;
531 
532               case COUL_CUT: {
533                 const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
534 
535                 if(eflag) {
536                   ecoul += forcecoul;
537                 }
538 
539                 fpair += forcecoul * (F_F(1.0) / rsq);
540               }
541               break;
542 
543 
544             }
545           }
546         }
547       }
548 
549 
550 
551       if(in_cutoff || in_coul_cutoff) {
552         F_FLOAT dxfp, dyfp, dzfp;
553         partialForce.x += dxfp = delx * fpair;
554         partialForce.y += dyfp = dely * fpair;
555         partialForce.z += dzfp = delz * fpair;
556 
557         if(vflag) {
558           partialVirial1.x += delx * dxfp;
559           partialVirial1.y += dely * dyfp;
560           partialVirial1.z += delz * dzfp;
561           partialVirial2.x += delx * dyfp;
562           partialVirial2.y += delx * dzfp;
563           partialVirial2.z += dely * dzfp;
564         }
565       }
566     }
567   }
568 
569   if(eflag) {
570     sharedEnergy[threadIdx.x] = evdwl;
571 
572     if(coul_type != COUL_NONE)
573       sharedEnergyCoul[threadIdx.x] = ecoul;
574   }
575 
576   sharedForce[threadIdx.x] = partialForce;
577 
578   if(vflag) {
579     sharedVirial1[threadIdx.x] = partialVirial1;
580     sharedVirial2[threadIdx.x] = partialVirial2;
581   }
582 
583   __syncthreads();
584 
585 
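  /* Tree reduction over the block: halve the number of active threads each step.
     This assumes blockDim.x is a power of two. */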
586   for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
587 
588     if(threadIdx.x < s) {
589       sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
590       sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
591       sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
592 
593       if(vflag) {
594         sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
595         sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
596         sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
597 
598         sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
599         sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
600         sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
601       }
602 
603       if(eflag) {
604         sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
605 
606         if(coul_type != COUL_NONE)
607           sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
608       }
609     }
610 
611     __syncthreads();
612   }
613 
614   if(threadIdx.x == 0) {
615 
616     ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
617 
618     if(eflag) {
619       ENERGY_FLOAT tmp_evdwl;
620       buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
621 
622       if(eflag_atom)
623         _eatom[i] = tmp_evdwl;
624 
625       buffer = &buffer[gridDim.x * gridDim.y];
626 
627       if(coul_type != COUL_NONE) {
628         buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
629 
630         if(eflag_atom)
631           _eatom[i] += tmp_evdwl;
632 
633         buffer = &buffer[gridDim.x * gridDim.y];
634       }
635     }
636 
637     if(vflag) {
638       ENERGY_FLOAT tmp;
639       buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
640 
641       if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
642 
643       buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
644 
645       if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
646 
647       buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
648 
649       if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
650 
651       buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
652 
653       if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
654 
655       buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
656 
657       if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
658 
659       buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
660 
661       if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
662 
663       buffer = &buffer[6 * gridDim.x * gridDim.y];
664     }
665 
666     F_FLOAT* my_f;
667 
668     if(_collect_forces_later) {
669       my_f = (F_FLOAT*) buffer;
670       my_f += i;
671       *my_f = sharedForce[0].x;
672       my_f += _nmax;
673       *my_f = sharedForce[0].y;
674       my_f += _nmax;
675       *my_f = sharedForce[0].z;
676     } else {
677       my_f = _f + i;
678       *my_f += sharedForce[0].x;
679       my_f += _nmax;
680       *my_f += sharedForce[0].y;
681       my_f += _nmax;
682       *my_f += sharedForce[0].z;
683     }
684   }
685 }
686 
687 
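/* Pair_Kernel_TpA_opt: thread-per-atom kernel with an additional comm_phase argument, used to
   split the force computation (e.g. so parts of it can be overlapped with communication):
     phase 0 - full neighbor list (_numneigh/_neighbors),
     phase 1 - inner-zone list (_numneigh_inner/_neighbors_inner),
     phase 2 - border atoms (_ilist_border/_numneigh_border/_neighbors_border).
   Special-bond factors are encoded differently here: a neighbor index j >= _nall stores the
   special-bond slot in j / _nall and the real atom index in j % _nall. */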
688 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
689 __global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
690 {
691   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
692   ENERGY_FLOAT ecoul = ENERGY_F(0.0);
693 
694   ENERGY_FLOAT* sharedE;
695   ENERGY_FLOAT* sharedECoul;
696   ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
697 
698   if(eflag || eflag_atom) {
699     sharedE = &sharedmem[threadIdx.x];
700     sharedE[0] = ENERGY_F(0.0);
701     sharedV += blockDim.x;
702 
703     if(coul_type != COUL_NONE) {
704       sharedECoul = sharedE + blockDim.x;
705       sharedECoul[0] = ENERGY_F(0.0);
706       sharedV += blockDim.x;
707     }
708   }
709 
710   if(vflag || vflag_atom) {
711     sharedV[0 * blockDim.x] = ENERGY_F(0.0);
712     sharedV[1 * blockDim.x] = ENERGY_F(0.0);
713     sharedV[2 * blockDim.x] = ENERGY_F(0.0);
714     sharedV[3 * blockDim.x] = ENERGY_F(0.0);
715     sharedV[4 * blockDim.x] = ENERGY_F(0.0);
716     sharedV[5 * blockDim.x] = ENERGY_F(0.0);
717   }
718 
719   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
720 
721   X_FLOAT xtmp, ytmp, ztmp;
722   X_FLOAT4 myxtype;
723   F_FLOAT fxtmp, fytmp, fztmp, fpair;
724   F_FLOAT delx, dely, delz;
725   F_FLOAT factor_lj, factor_coul;
726   F_FLOAT qtmp;
727   int itype, i, j;
728   int jnum = 0;
729   int* jlist;
730 
731   if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
732     i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;
733 
734     myxtype = fetchXType(i);
735     // myxtype = _x_type[i];  (redundant: fetchXType(i) above already loads this entry)
736     xtmp = myxtype.x;
737     ytmp = myxtype.y;
738     ztmp = myxtype.z;
739     itype = static_cast <int>(myxtype.w);
740 
741 
742     fxtmp = F_F(0.0);
743     fytmp = F_F(0.0);
744     fztmp = F_F(0.0);
745 
746     if(coul_type != COUL_NONE)
747       qtmp = fetchQ(i);
748 
749     jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
750 
751 
752     jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
753   }
754 
755   __syncthreads();
756 
757   for(int jj = 0; jj < jnum; jj++) {
758     if(ii < (comm_phase < 2 ? _inum : _inum_border[0]))
759       if(jj < jnum) {
760         fpair = F_F(0.0);
761         j = jlist[jj * _nlocal];
762 
763         factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
764 
765         if(coul_type != COUL_NONE)
766           factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
767 
768         j = j < _nall ? j : j % _nall;
769 
770         myxtype = fetchXType(j);
771         delx = xtmp - myxtype.x;
772         dely = ytmp - myxtype.y;
773         delz = ztmp - myxtype.z;
774         int jtype = static_cast <int>(myxtype.w);
775 
776 
777         const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
778 
779         bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
780 
781         if(in_cutoff) {
782           switch(pair_type) {
783             case PAIR_BORN:
784               fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
785               break;
786 
787             case PAIR_BUCK:
788               fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
789               break;
790 
791             case PAIR_CG_CMM:
792               fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
793               break;
794 
795             case PAIR_LJ_CHARMM:
796               fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
797               break;
798 
799             case PAIR_LJ_CLASS2:
800               fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
801               break;
802 
803             case PAIR_LJ_CUT:
804               fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
805               break;
806 
807             case PAIR_LJ_EXPAND:
808               fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
809               break;
810 
811             case PAIR_LJ_GROMACS:
812               fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
813               break;
814 
815             case PAIR_LJ_SMOOTH:
816               fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
817               break;
818 
819             case PAIR_LJ96_CUT:
820               fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
821               break;
822 
823             case PAIR_MORSE_R6:
824               fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
825               break;
826 
827             case PAIR_MORSE:
828               fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
829               break;
830           }
831         }
832 
833         if(coul_type != COUL_NONE) {
834           const F_FLOAT qiqj = qtmp * fetchQ(j);
835 
836           if(qiqj * qiqj > 1e-8) {
837             const bool in_coul_cutoff =
838               rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
839 
840             if(in_coul_cutoff) {
841               switch(coul_type) {
842                 case COUL_CHARMM:
843                   fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
844                   break;
845 
846                 case COUL_CHARMM_IMPLICIT:
847                   fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
848                   break;
849 
850                 case COUL_CUT: {
851                   const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
852 
853                   if(eflag) {
854                     ecoul += forcecoul;
855                   }
856 
857                   fpair += forcecoul * (F_F(1.0) / rsq);
858                 }
859                 break;
860 
861                 case COUL_DEBYE: {
862                   const F_FLOAT r2inv = F_F(1.0) / rsq;
863                   const X_FLOAT r = _RSQRT_(r2inv);
864                   const X_FLOAT rinv = F_F(1.0) / r;
865                   const F_FLOAT screening = _EXP_(-_kappa * r);
866                   F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
867 
868                   if(eflag) {
869                     ecoul += forcecoul * rinv;
870                   }
871 
872                   forcecoul *= (_kappa + rinv);
873                   fpair += forcecoul * r2inv;
874                 }
875                 break;
876 
877                 case COUL_GROMACS:
878                   fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
879                   break;
880 
881                 case COUL_LONG: {
882                   const F_FLOAT r2inv = F_F(1.0) / rsq;
883                   const F_FLOAT r = _RSQRT_(r2inv);
884                   const F_FLOAT grij = _g_ewald * r;
885                   const F_FLOAT expm2 = _EXP_(-grij * grij);
886                   const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
887                   const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
888                   const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
889                   F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
890 
891                   if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
892 
893                   if(eflag) {
894                     ecoul += prefactor * erfc;
895 
896                     if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
897                   }
898 
899                   fpair += forcecoul * r2inv;
900                 }
901                 break;
902 
903               }
904             }
905 
906             in_cutoff = in_cutoff || in_coul_cutoff;
907           }
908         }
909 
910 
911         if(in_cutoff) {
912           F_FLOAT dxfp, dyfp, dzfp;
913           fxtmp += dxfp = delx * fpair;
914           fytmp += dyfp = dely * fpair;
915           fztmp += dzfp = delz * fpair;
916 
917           if(vflag) {
918             sharedV[0 * blockDim.x] += delx * dxfp;
919             sharedV[1 * blockDim.x] += dely * dyfp;
920             sharedV[2 * blockDim.x] += delz * dzfp;
921             sharedV[3 * blockDim.x] += delx * dyfp;
922             sharedV[4 * blockDim.x] += delx * dzfp;
923             sharedV[5 * blockDim.x] += dely * dzfp;
924           }
925         }
926       }
927   }
928 
929   __syncthreads();
930 
931   if(ii < (comm_phase < 2 ? _inum : _inum_border[0])) {
932     F_FLOAT* my_f;
933 
934     if(_collect_forces_later) {
935       ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
936 
937       if(eflag) {
938         buffer = &buffer[1 * gridDim.x * gridDim.y];
939 
940         if(coul_type != COUL_NONE)
941           buffer = &buffer[1 * gridDim.x * gridDim.y];
942       }
943 
944       if(vflag) {
945         buffer = &buffer[6 * gridDim.x * gridDim.y];
946       }
947 
948       my_f = (F_FLOAT*) buffer;
949       my_f += i;
950       *my_f = fxtmp;
951       my_f += _nmax;
952       *my_f = fytmp;
953       my_f += _nmax;
954       *my_f = fztmp;
955     } else {
956       my_f = _f + i;
957       *my_f += fxtmp;
958       my_f += _nmax;
959       *my_f += fytmp;
960       my_f += _nmax;
961       *my_f += fztmp;
962     }
963   }
964 
965   __syncthreads();
966 
967   if(eflag) {
968     sharedE[0] = evdwl;
969 
970     if(coul_type != COUL_NONE)
971       sharedECoul[0] = ecoul;
972   }
973 
974   if(eflag_atom && ii < (comm_phase < 2 ? _inum : _inum_border[0]) && i < _nlocal) {   // guard: i is only set for active threads
975     if(coul_type != COUL_NONE)
976       _eatom[i] += evdwl + ecoul;
977     else
978       _eatom[i] += evdwl;
979   }
980 
981   if(vflag_atom && ii < (comm_phase < 2 ? _inum : _inum_border[0]) && i < _nlocal) {
982     _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
983     _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
984     _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
985     _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
986     _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
987     _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
988   }
989 
990   if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
991 }
992 
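/* Pair_Kernel_BpA_opt: block-per-atom counterpart of Pair_Kernel_TpA_opt, with the same
   comm_phase handling and special-bond encoding. */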
993 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
994 __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
995 {
996   int ii = (blockIdx.x * gridDim.y + blockIdx.y);
997 
998   if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
999     return;
1000 
1001   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
1002   ENERGY_FLOAT ecoul = ENERGY_F(0.0);
1003   F_FLOAT3* sharedVirial1;
1004   F_FLOAT3* sharedVirial2;
1005   F_FLOAT* sharedEnergy;
1006   F_FLOAT* sharedEnergyCoul;
1007 
1008   F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
1009 
1010   if(vflag) {
1011     sharedVirial1 = &sharedForce[64];
1012     sharedVirial2 = &sharedVirial1[64];
1013   } else {
1014     sharedVirial1 = &sharedForce[0];
1015     sharedVirial2 = &sharedVirial1[0];
1016   }
1017 
1018   if(eflag) {
1019     if(vflag || vflag_atom)
1020       sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
1021     else
1022       sharedEnergy = (F_FLOAT*) &sharedForce[64];
1023 
1024     if(coul_type != COUL_NONE)
1025       sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
1026 
1027   }
1028 
1029   F_FLOAT3 partialForce = { F_F(0.0),  F_F(0.0),  F_F(0.0) };
1030   F_FLOAT3 partialVirial1 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
1031   F_FLOAT3 partialVirial2 = {  F_F(0.0),  F_F(0.0),  F_F(0.0) };
1032 
1033   X_FLOAT xtmp, ytmp, ztmp;
1034   X_FLOAT4 myxtype;
1035   F_FLOAT delx, dely, delz;
1036   F_FLOAT factor_lj, factor_coul;
1037   F_FLOAT fpair;
1038   F_FLOAT qtmp;
1039   int itype, jnum, i, j;
1040   int* jlist;
1041 
1042   i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];
1043 
1044   myxtype = fetchXType(i);
1045 
1046   xtmp = myxtype.x;
1047   ytmp = myxtype.y;
1048   ztmp = myxtype.z;
1049   itype = static_cast <int>(myxtype.w);
1050 
1051   if(coul_type != COUL_NONE)
1052     qtmp = fetchQ(i);
1053 
1054   jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
1055 
1056   jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
1057   __syncthreads();
1058 
1059   for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
1060     if(jj < jnum) {
1061       fpair = F_F(0.0);
1062       j = jlist[jj];
1063       factor_lj   = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
1064 
1065       if(coul_type != COUL_NONE)
1066         factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
1067 
1068       j = j < _nall ? j : j % _nall;
1069 
1070       myxtype = fetchXType(j);
1071 
1072       delx = xtmp - myxtype.x;
1073       dely = ytmp - myxtype.y;
1074       delz = ztmp - myxtype.z;
1075       int jtype = static_cast <int>(myxtype.w);
1076 
1077       const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
1078 
1079       bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
1080       bool in_coul_cutoff = false;   // initialized: only assigned below when charges are non-negligible
1081 
1082       if(in_cutoff) {
1083         switch(pair_type) {
1084           case PAIR_BORN:
1085             fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1086             break;
1087 
1088           case PAIR_BUCK:
1089             fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1090             break;
1091 
1092           case PAIR_CG_CMM:
1093             fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1094             break;
1095 
1096           case PAIR_LJ_CHARMM:
1097             fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1098             break;
1099 
1100           case PAIR_LJ_CLASS2:
1101             fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1102             break;
1103 
1104           case PAIR_LJ_CUT:
1105             fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1106             break;
1107 
1108           case PAIR_LJ_EXPAND:
1109             fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1110             break;
1111 
1112           case PAIR_LJ_GROMACS:
1113             fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1114             break;
1115 
1116           case PAIR_LJ_SMOOTH:
1117             fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1118             break;
1119 
1120           case PAIR_LJ96_CUT:
1121             fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1122             break;
1123 
1124           case PAIR_MORSE_R6:
1125             fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1126             break;
1127 
1128           case PAIR_MORSE:
1129             fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1130             break;
1131         }
1132       }
1133 
1134       if(coul_type != COUL_NONE) {
1135         const F_FLOAT qiqj = qtmp * fetchQ(j);
1136 
1137         if(qiqj * qiqj > (1e-8f)) {
1138           in_coul_cutoff =
1139             rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
1140 
1141           if(in_coul_cutoff) {
1142             switch(coul_type) {
1143               case COUL_CHARMM:
1144                 fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
1145                 break;
1146 
1147               case COUL_CHARMM_IMPLICIT:
1148                 fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
1149                 break;
1150 
1151               case COUL_GROMACS:
1152                 fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
1153                 break;
1154 
1155               case COUL_LONG: {
1156                 const F_FLOAT r2inv = F_F(1.0) / rsq;
1157                 const F_FLOAT r = _RSQRT_(r2inv);
1158                 const F_FLOAT grij = _g_ewald * r;
1159                 const F_FLOAT expm2 = _EXP_(-grij * grij);
1160                 const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
1161                 const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
1162                 const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
1163                 F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
1164 
1165                 if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
1166 
1167                 if(eflag) {
1168                   ecoul += prefactor * erfc;
1169 
1170                   if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
1171                 }
1172 
1173                 fpair += forcecoul * r2inv;
1174               }
1175               break;
1176 
1177               case COUL_DEBYE: {
1178                 const F_FLOAT r2inv = F_F(1.0) / rsq;
1179                 const X_FLOAT r = _RSQRT_(r2inv);
1180                 const X_FLOAT rinv = F_F(1.0) / r;
1181                 const F_FLOAT screening = _EXP_(-_kappa * r);
1182                 F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
1183 
1184                 if(eflag) {
1185                   ecoul += forcecoul * rinv;
1186                 }
1187 
1188                 forcecoul *= (_kappa + rinv);
1189                 fpair += forcecoul * r2inv;
1190               }
1191               break;
1192 
1193               case COUL_CUT: {
1194                 const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
1195 
1196                 if(eflag) {
1197                   ecoul += forcecoul;
1198                 }
1199 
1200                 fpair += forcecoul * (F_F(1.0) / rsq);
1201               }
1202               break;
1203 
1204 
1205             }
1206           }
1207         }
1208       }
1209 
1210 
1211 
1212       if(in_cutoff || in_coul_cutoff) {
1213         F_FLOAT dxfp, dyfp, dzfp;
1214         partialForce.x += dxfp = delx * fpair;
1215         partialForce.y += dyfp = dely * fpair;
1216         partialForce.z += dzfp = delz * fpair;
1217 
1218         if(vflag) {
1219           partialVirial1.x += delx * dxfp;
1220           partialVirial1.y += dely * dyfp;
1221           partialVirial1.z += delz * dzfp;
1222           partialVirial2.x += delx * dyfp;
1223           partialVirial2.y += delx * dzfp;
1224           partialVirial2.z += dely * dzfp;
1225         }
1226       }
1227     }
1228   }
1229 
1230   if(eflag) {
1231     sharedEnergy[threadIdx.x] = evdwl;
1232 
1233     if(coul_type != COUL_NONE)
1234       sharedEnergyCoul[threadIdx.x] = ecoul;
1235   }
1236 
1237   sharedForce[threadIdx.x] = partialForce;
1238 
1239   if(vflag) {
1240     sharedVirial1[threadIdx.x] = partialVirial1;
1241     sharedVirial2[threadIdx.x] = partialVirial2;
1242   }
1243 
1244   __syncthreads();
1245 
1246 
1247   for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
1248 
1249     if(threadIdx.x < s) {
1250       sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
1251       sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
1252       sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
1253 
1254       if(vflag) {
1255         sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
1256         sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
1257         sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
1258 
1259         sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
1260         sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
1261         sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
1262       }
1263 
1264       if(eflag) {
1265         sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
1266 
1267         if(coul_type != COUL_NONE)
1268           sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
1269       }
1270     }
1271 
1272     __syncthreads();
1273   }
1274 
1275   if(threadIdx.x == 0) {
1276 
1277     ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
1278 
1279     if(eflag) {
1280       ENERGY_FLOAT tmp_evdwl;
1281       buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
1282 
1283       if(eflag_atom)
1284         _eatom[i] = tmp_evdwl;
1285 
1286       buffer = &buffer[gridDim.x * gridDim.y];
1287 
1288       if(coul_type != COUL_NONE) {
1289         buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
1290 
1291         if(eflag_atom)
1292           _eatom[i] += tmp_evdwl;
1293 
1294         buffer = &buffer[gridDim.x * gridDim.y];
1295       }
1296     }
1297 
1298     if(vflag) {
1299       ENERGY_FLOAT tmp;
1300       buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
1301 
1302       if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
1303 
1304       buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
1305 
1306       if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
1307 
1308       buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
1309 
1310       if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
1311 
1312       buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
1313 
1314       if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
1315 
1316       buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
1317 
1318       if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
1319 
1320       buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
1321 
1322       if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
1323 
1324       buffer = &buffer[6 * gridDim.x * gridDim.y];
1325     }
1326 
1327     F_FLOAT* my_f;
1328 
1329     if(_collect_forces_later) {
1330       my_f = (F_FLOAT*) buffer;
1331       my_f += i;
1332       *my_f = sharedForce[0].x;
1333       my_f += _nmax;
1334       *my_f = sharedForce[0].y;
1335       my_f += _nmax;
1336       *my_f = sharedForce[0].z;
1337     } else {
1338       my_f = _f + i;
1339       *my_f += sharedForce[0].x;
1340       my_f += _nmax;
1341       *my_f += sharedForce[0].y;
1342       my_f += _nmax;
1343       *my_f += sharedForce[0].z;
1344     }
1345   }
1346 }
1347 
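/* Helper kernels: pack per-atom arrays into 4-component structs (_x_type, _v_radius,
   _omega_rmass), presumably for coalesced/texture access; unpack them again (RevertXType);
   copy the current positions into _xhold (BuildXHold); and add forces that a pair kernel
   staged in _buffer back into _f (CollectForces). */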
1348 __global__ void Pair_GenerateXType_Kernel()
1349 {
1350   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1351 
1352   if(i < _nall) {
1353     X_FLOAT4 xtype;
1354     xtype.x = _x[i];
1355     xtype.y = _x[i + _nmax];
1356     xtype.z = _x[i + 2 * _nmax];
1357     xtype.w = _type[i];
1358     _x_type[i] = xtype;
1359   }
1360 
1361 }
1362 
1363 __global__ void Pair_GenerateVRadius_Kernel()
1364 {
1365   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1366 
1367   if(i < _nall) {
1368     V_FLOAT4 vradius;
1369     vradius.x = _v[i];
1370     vradius.y = _v[i + _nmax];
1371     vradius.z = _v[i + 2 * _nmax];
1372     vradius.w = _radius[i];
1373     _v_radius[i] = vradius;
1374   }
1375 }
1376 
1377 __global__ void Pair_GenerateOmegaRmass_Kernel()
1378 {
1379   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1380 
1381   if(i < _nall) {
1382     V_FLOAT4 omegarmass;
1383     omegarmass.x = _omega[i];
1384     omegarmass.y = _omega[i + _nmax];
1385     omegarmass.z = _omega[i + 2 * _nmax];
1386     omegarmass.w = _rmass[i];
1387     _omega_rmass[i] = omegarmass;
1388   }
1389 }
1390 
1391 __global__ void Pair_RevertXType_Kernel()
1392 {
1393   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1394 
1395   if(i < _nall) {
1396     X_FLOAT4 xtype = _x_type[i];
1397     _x[i] = xtype.x;
1398     _x[i + _nmax] = xtype.y;
1399     _x[i + 2 * _nmax] = xtype.z;
1400     _type[i] = static_cast <int>(xtype.w);
1401   }
1402 
1403 }
1404 
1405 __global__ void Pair_BuildXHold_Kernel()
1406 {
1407   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1408 
1409   if(i < _nall) {
1410     X_FLOAT4 xtype = _x_type[i];
1411     _xhold[i] = xtype.x;
1412     _xhold[i + _nmax] = xtype.y;
1413     _xhold[i + 2 * _nmax] = xtype.z;
1414   }
1415 
1416 }
1417 
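/* Adds the per-atom forces that a pair kernel stored in _buffer (behind the
   nperblock * n energy/virial entries) into the force array _f. */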
1418 __global__ void Pair_CollectForces_Kernel(int nperblock, int n)
1419 {
1420   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
1421 
1422   if(i >= _nlocal) return;
1423 
1424   ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;
1425 
1426   F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
1427   F_FLOAT* my_f = _f + i;
1428   buf_f += i;
1429   *my_f += * buf_f;
1430   my_f += _nmax;
1431   buf_f += _nmax;
1432   *my_f += * buf_f;
1433   my_f += _nmax;
1434   buf_f += _nmax;
1435   *my_f += * buf_f;
1436   my_f += _nmax;
1437 }
1438