/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator

   Original Version:
   http://lammps.sandia.gov, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   See the README file in the top-level LAMMPS directory.

   -----------------------------------------------------------------------

   USER-CUDA Package and associated modifications:
   https://sourceforge.net/projects/lammpscuda/

   Christian Trott, christian.trott@tu-ilmenau.de
   Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
   Theoretical Physics II, University of Technology Ilmenau, Germany

   See the README file in the USER-CUDA directory.

   This software is distributed under the GNU General Public License.
------------------------------------------------------------------------- */
/* Constants for the real-space part of the long-range (Ewald) Coulomb term.
   EWALD_F is 2/sqrt(pi), the prefactor of d/dx erfc(x) = -EWALD_F*exp(-x*x).
   EWALD_P and A1..A5 are the coefficients of the Abramowitz & Stegun
   eq. 7.1.26 polynomial approximation of the complementary error function:
     erfc(x) ~= t*(A1 + t*(A2 + t*(A3 + t*(A4 + t*A5)))) * exp(-x*x),
     t = 1/(1 + EWALD_P*x),   absolute error < 1.5e-7 for x >= 0. */
#define EWALD_F 1.12837917
#define EWALD_P 0.3275911
#define A1 0.254829592
#define A2 -0.284496736
#define A3 1.421413741
#define A4 -1.453152027
#define A5 1.061405429
30
31
/* Thread-per-Atom (TpA) pair-force kernel: each CUDA thread owns one atom i
   taken from _ilist and loops serially over all of i's neighbors.  pair_type
   and coul_type are template (compile-time) constants, so each switch below
   collapses to a single case in the generated code.
   Dynamic shared memory layout expected from the launcher, per thread slot:
     [evdwl (if eflag)] [ecoul (if eflag && coul)] [6 virial components],
   each region blockDim.x entries wide; reduced by PairVirialCompute_A_Kernel.
   Fix vs. previous revision: the per-atom energy/virial tail blocks no longer
   read the uninitialized index i on threads with ii >= _inum. */
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
  ENERGY_FLOAT ecoul = ENERGY_F(0.0);

  ENERGY_FLOAT* sharedE;
  ENERGY_FLOAT* sharedECoul;
  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];

  // Carve this thread's accumulator slots out of dynamic shared memory;
  // sharedV is advanced past whichever energy regions are in use.
  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;

    if(coul_type != COUL_NONE) {
      sharedECoul = sharedE + blockDim.x;
      sharedECoul[0] = ENERGY_F(0.0);
      sharedV += blockDim.x;
    }
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  // global thread index over a 2D grid of 1D blocks
  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  X_FLOAT xtmp, ytmp, ztmp;
  X_FLOAT4 myxtype;
  F_FLOAT fxtmp, fytmp, fztmp, fpair;
  F_FLOAT delx, dely, delz;
  F_FLOAT factor_lj, factor_coul;
  F_FLOAT qtmp;
  int itype, i, j;
  int jnum = 0;  // stays 0 for inactive threads -> neighbor loop is skipped
  int* jlist;

  if(ii < _inum) {
    i = _ilist[ii];

    myxtype = fetchXType(i);
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast <int>(myxtype.w);

    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    if(coul_type != COUL_NONE)
      qtmp = fetchQ(i);

    jnum = _numneigh[i];
    // transposed neighbor layout: neighbor jj of atom i is at
    // _neighbors[i + jj * _nlocal] (coalesced access across threads)
    jlist = &_neighbors[i];
  }

  __syncthreads();

  for(int jj = 0; jj < jnum; jj++) {
    if(ii < _inum)
      if(jj < jnum) {
        fpair = F_F(0.0);
        j = jlist[jj * _nlocal];
        factor_lj = _special_lj[sbmask(j)];  // special-bond weight encoded in high bits of j

        if(coul_type != COUL_NONE)
          factor_coul = _special_coul[sbmask(j)];

        j &= NEIGHMASK;

        myxtype = fetchXType(j);
        delx = xtmp - myxtype.x;
        dely = ytmp - myxtype.y;
        delz = ztmp - myxtype.z;
        int jtype = static_cast <int>(myxtype.w);

        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        // global cutoff if set (> 0), otherwise the per-type-pair cutoff
        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);

        if(in_cutoff) {
          // compile-time dispatch: only the selected pair style survives
          switch(pair_type) {
            case PAIR_BORN:
              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_BUCK:
              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_CG_CMM:
              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CHARMM:
              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CLASS2:
              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CUT:
              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_EXPAND:
              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_GROMACS:
              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_SMOOTH:
              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ96_CUT:
              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE_R6:
              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE:
              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;
          }
        }

        if(coul_type != COUL_NONE) {
          const F_FLOAT qiqj = qtmp * fetchQ(j);

          // skip pairs with (numerically) zero charge product
          if(qiqj * qiqj > 1e-8) {
            const bool in_coul_cutoff =
              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

            if(in_coul_cutoff) {
              switch(coul_type) {
                case COUL_CHARMM:
                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CHARMM_IMPLICIT:
                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CUT: {
                  const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                  if(eflag) {
                    ecoul += forcecoul;
                  }

                  fpair += forcecoul * (F_F(1.0) / rsq);
                }
                break;

                case COUL_DEBYE: {
                  const F_FLOAT r2inv = F_F(1.0) / rsq;
                  const X_FLOAT r = _RSQRT_(r2inv);
                  const X_FLOAT rinv = F_F(1.0) / r;
                  const F_FLOAT screening = _EXP_(-_kappa * r);
                  F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                  if(eflag) {
                    ecoul += forcecoul * rinv;
                  }

                  forcecoul *= (_kappa + rinv);
                  fpair += forcecoul * r2inv;
                }
                break;

                case COUL_GROMACS:
                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_LONG: {
                  // real-space Ewald term with the A&S erfc approximation
                  const F_FLOAT r2inv = F_F(1.0) / rsq;
                  const F_FLOAT r = _RSQRT_(r2inv);
                  const F_FLOAT grij = _g_ewald * r;
                  const F_FLOAT expm2 = _EXP_(-grij * grij);
                  const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                  const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                  const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                  F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                  if(eflag) {
                    ecoul += prefactor * erfc;

                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                  }

                  fpair += forcecoul * r2inv;
                }
                break;
              }
            }

            // force/virial accumulation below applies if either cutoff hit
            in_cutoff = in_cutoff || in_coul_cutoff;
          }
        }

        if(in_cutoff) {
          F_FLOAT dxfp, dyfp, dzfp;
          fxtmp += dxfp = delx * fpair;
          fytmp += dyfp = dely * fpair;
          fztmp += dzfp = delz * fpair;

          if(vflag) {
            sharedV[0 * blockDim.x] += delx * dxfp;
            sharedV[1 * blockDim.x] += dely * dyfp;
            sharedV[2 * blockDim.x] += delz * dzfp;
            sharedV[3 * blockDim.x] += delx * dyfp;
            sharedV[4 * blockDim.x] += delx * dzfp;
            sharedV[5 * blockDim.x] += dely * dzfp;
          }
        }
      }
  }

  __syncthreads();

  if(ii < _inum) {
    F_FLOAT* my_f;

    if(_collect_forces_later) {
      // forces go into the staging buffer, after the energy/virial regions
      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;

      if(eflag) {
        buffer = &buffer[1 * gridDim.x * gridDim.y];

        if(coul_type != COUL_NONE)
          buffer = &buffer[1 * gridDim.x * gridDim.y];
      }

      if(vflag) {
        buffer = &buffer[6 * gridDim.x * gridDim.y];
      }

      my_f = (F_FLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp;
      my_f += _nmax;
      *my_f = fytmp;
      my_f += _nmax;
      *my_f = fztmp;
    } else {
      // accumulate directly into the force array (SoA, stride _nmax)
      my_f = _f + i;
      *my_f += fxtmp;
      my_f += _nmax;
      *my_f += fytmp;
      my_f += _nmax;
      *my_f += fztmp;
    }
  }

  __syncthreads();

  if(eflag) {
    sharedE[0] = evdwl;

    if(coul_type != COUL_NONE)
      sharedECoul[0] = ecoul;
  }

  // guard with ii < _inum: on inactive threads i is uninitialized and
  // must not be used to index _eatom/_vatom
  if(eflag_atom && ii < _inum && i < _nlocal) {
    if(coul_type != COUL_NONE)
      _eatom[i] += evdwl + ecoul;
    else
      _eatom[i] += evdwl;
  }

  if(vflag_atom && ii < _inum && i < _nlocal) {
    _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }

  // block-wide reduction of the shared energy/virial slots (all threads
  // must reach this call; it contains __syncthreads internally)
  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}
331
/* Block-per-Atom (BpA) pair-force kernel: each thread block owns one atom i
   (block index flattened over a 2D grid); the block's threads stride over
   i's neighbor list and the partial force/energy/virial results are reduced
   in shared memory, with thread 0 writing the block result.
   Shared memory layout: 64-entry F_FLOAT3 force region, then (if vflag) two
   64-entry virial regions, then (if eflag) 64-entry energy region(s).
   NOTE(review): the reduction assumes blockDim.x <= 64 and a power of two —
   confirm against the launcher.
   Fix vs. previous revision: in_coul_cutoff is now initialized; it was read
   at the force-accumulation test without ever being written whenever
   coul_type == COUL_NONE or the charge product was negligible. */
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_BpA(int eflag, int vflag, int eflag_atom, int vflag_atom)
{
  int ii = (blockIdx.x * gridDim.y + blockIdx.y);

  if(ii >= _inum)
    return;

  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
  ENERGY_FLOAT ecoul = ENERGY_F(0.0);
  F_FLOAT3* sharedVirial1;
  F_FLOAT3* sharedVirial2;
  F_FLOAT* sharedEnergy;
  F_FLOAT* sharedEnergyCoul;

  F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];

  if(vflag) {
    sharedVirial1 = &sharedForce[64];
    sharedVirial2 = &sharedVirial1[64];
  } else {
    // no virial requested: alias onto the force region (never read)
    sharedVirial1 = &sharedForce[0];
    sharedVirial2 = &sharedVirial1[0];
  }

  if(eflag) {
    if(vflag || vflag_atom)
      sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
    else
      sharedEnergy = (F_FLOAT*) &sharedForce[64];

    if(coul_type != COUL_NONE)
      sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];

  }

  F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
  F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
  F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };

  X_FLOAT xtmp, ytmp, ztmp;
  X_FLOAT4 myxtype;
  F_FLOAT delx, dely, delz;
  F_FLOAT factor_lj, factor_coul;
  F_FLOAT fpair;
  F_FLOAT qtmp;
  int itype, jnum, i, j;
  int* jlist;

  i = _ilist[ii];

  myxtype = fetchXType(i);

  xtmp = myxtype.x;
  ytmp = myxtype.y;
  ztmp = myxtype.z;
  itype = static_cast <int>(myxtype.w);

  if(coul_type != COUL_NONE)
    qtmp = fetchQ(i);

  jnum = _numneigh[i];

  // contiguous neighbor layout: neighbors of atom i start at i*_maxneighbors
  jlist = &_neighbors[i * _maxneighbors];
  __syncthreads();

  // each thread handles neighbors jj = threadIdx.x, +blockDim.x, ...
  for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
    if(jj < jnum) {
      fpair = F_F(0.0);
      j = jlist[jj];
      factor_lj = _special_lj[sbmask(j)];  // special-bond weight from high bits of j

      if(coul_type != COUL_NONE)
        factor_coul = _special_coul[sbmask(j)];

      j &= NEIGHMASK;

      myxtype = fetchXType(j);

      delx = xtmp - myxtype.x;
      dely = ytmp - myxtype.y;
      delz = ztmp - myxtype.z;
      int jtype = static_cast <int>(myxtype.w);

      const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;

      bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
      // must be initialized: it is read below even when the coulomb branch
      // never runs (COUL_NONE or negligible charge product)
      bool in_coul_cutoff = false;

      if(in_cutoff) {
        // compile-time dispatch: only the selected pair style survives
        switch(pair_type) {
          case PAIR_BORN:
            fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_BUCK:
            fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_CG_CMM:
            fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CHARMM:
            fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CLASS2:
            fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_CUT:
            fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_EXPAND:
            fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_GROMACS:
            fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ_SMOOTH:
            fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_LJ96_CUT:
            fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE_R6:
            fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;

          case PAIR_MORSE:
            fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
            break;
        }
      }

      if(coul_type != COUL_NONE) {
        const F_FLOAT qiqj = qtmp * fetchQ(j);

        if(qiqj * qiqj > (1e-8f)) {
          in_coul_cutoff =
            rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

          if(in_coul_cutoff) {
            switch(coul_type) {
              case COUL_CHARMM:
                fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_CHARMM_IMPLICIT:
                fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_GROMACS:
                fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                break;

              case COUL_LONG: {
                // real-space Ewald term with the A&S erfc approximation
                const F_FLOAT r2inv = F_F(1.0) / rsq;
                const F_FLOAT r = _RSQRT_(r2inv);
                const F_FLOAT grij = _g_ewald * r;
                const F_FLOAT expm2 = _EXP_(-grij * grij);
                const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                if(eflag) {
                  ecoul += prefactor * erfc;

                  if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                }

                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_DEBYE: {
                const F_FLOAT r2inv = F_F(1.0) / rsq;
                const X_FLOAT r = _RSQRT_(r2inv);
                const X_FLOAT rinv = F_F(1.0) / r;
                const F_FLOAT screening = _EXP_(-_kappa * r);
                F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                if(eflag) {
                  ecoul += forcecoul * rinv;
                }

                forcecoul *= (_kappa + rinv);
                fpair += forcecoul * r2inv;
              }
              break;

              case COUL_CUT: {
                const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                if(eflag) {
                  ecoul += forcecoul;
                }

                fpair += forcecoul * (F_F(1.0) / rsq);
              }
              break;

            }
          }
        }
      }

      // accumulate when either the pair or the coulomb cutoff was hit
      if(in_cutoff || in_coul_cutoff) {
        F_FLOAT dxfp, dyfp, dzfp;
        partialForce.x += dxfp = delx * fpair;
        partialForce.y += dyfp = dely * fpair;
        partialForce.z += dzfp = delz * fpair;

        if(vflag) {
          partialVirial1.x += delx * dxfp;
          partialVirial1.y += dely * dyfp;
          partialVirial1.z += delz * dzfp;
          partialVirial2.x += delx * dyfp;
          partialVirial2.y += delx * dzfp;
          partialVirial2.z += dely * dzfp;
        }
      }
    }
  }

  // publish per-thread partials for the block-wide tree reduction
  if(eflag) {
    sharedEnergy[threadIdx.x] = evdwl;

    if(coul_type != COUL_NONE)
      sharedEnergyCoul[threadIdx.x] = ecoul;
  }

  sharedForce[threadIdx.x] = partialForce;

  if(vflag) {
    sharedVirial1[threadIdx.x] = partialVirial1;
    sharedVirial2[threadIdx.x] = partialVirial2;
  }

  __syncthreads();

  // standard power-of-two tree reduction over the block
  for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {

    if(threadIdx.x < s) {
      sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
      sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
      sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;

      if(vflag) {
        sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
        sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
        sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;

        sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
        sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
        sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
      }

      if(eflag) {
        sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];

        if(coul_type != COUL_NONE)
          sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
      }
    }

    __syncthreads();
  }

  // thread 0 writes the block's reduced results
  if(threadIdx.x == 0) {

    ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;

    if(eflag) {
      ENERGY_FLOAT tmp_evdwl;
      // 0.5 factor: each pair is visited from both atoms
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];

      if(eflag_atom)
        _eatom[i] = tmp_evdwl;

      buffer = &buffer[gridDim.x * gridDim.y];

      if(coul_type != COUL_NONE) {
        buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];

        if(eflag_atom)
          _eatom[i] += tmp_evdwl;

        buffer = &buffer[gridDim.x * gridDim.y];
      }
    }

    if(vflag) {
      ENERGY_FLOAT tmp;
      buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;

      if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;

      if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;

      if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;

      if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;

      if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;

      buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;

      if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;

      buffer = &buffer[6 * gridDim.x * gridDim.y];
    }

    F_FLOAT* my_f;

    if(_collect_forces_later) {
      // stage forces after the energy/virial regions for a later gather
      my_f = (F_FLOAT*) buffer;
      my_f += i;
      *my_f = sharedForce[0].x;
      my_f += _nmax;
      *my_f = sharedForce[0].y;
      my_f += _nmax;
      *my_f = sharedForce[0].z;
    } else {
      // accumulate directly into the force array (SoA, stride _nmax)
      my_f = _f + i;
      *my_f += sharedForce[0].x;
      my_f += _nmax;
      *my_f += sharedForce[0].y;
      my_f += _nmax;
      *my_f += sharedForce[0].z;
    }
  }
}
686
687
/* Thread-per-Atom pair kernel, communication-overlap variant.  Identical to
   Pair_Kernel_TpA except that comm_phase selects which atoms/neighbor lists
   are processed:
     0 - full list (_ilist/_numneigh/_neighbors),
     1 - inner atoms only (_numneigh_inner/_neighbors_inner),
     2 - border atoms only (_ilist_border/_numneigh_border/_neighbors_border,
         indexed by ii and bounded by _inum_border[0]).
   In this variant the special-bond weight is encoded multiplicatively
   (j >= _nall => weight index j/_nall, real index j%_nall) instead of via
   sbmask high bits.
   Fixes vs. previous revision: removed a dead re-load of the atom position
   (fetchXType(i) was immediately overwritten by a redundant _x_type[i]
   read), and the per-atom energy/virial tail blocks no longer read the
   uninitialized index i on threads beyond the active atom count. */
template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
__global__ void Pair_Kernel_TpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
{
  ENERGY_FLOAT evdwl = ENERGY_F(0.0);
  ENERGY_FLOAT ecoul = ENERGY_F(0.0);

  ENERGY_FLOAT* sharedE;
  ENERGY_FLOAT* sharedECoul;
  ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];

  // Carve this thread's accumulator slots out of dynamic shared memory;
  // sharedV is advanced past whichever energy regions are in use.
  if(eflag || eflag_atom) {
    sharedE = &sharedmem[threadIdx.x];
    sharedE[0] = ENERGY_F(0.0);
    sharedV += blockDim.x;

    if(coul_type != COUL_NONE) {
      sharedECoul = sharedE + blockDim.x;
      sharedECoul[0] = ENERGY_F(0.0);
      sharedV += blockDim.x;
    }
  }

  if(vflag || vflag_atom) {
    sharedV[0 * blockDim.x] = ENERGY_F(0.0);
    sharedV[1 * blockDim.x] = ENERGY_F(0.0);
    sharedV[2 * blockDim.x] = ENERGY_F(0.0);
    sharedV[3 * blockDim.x] = ENERGY_F(0.0);
    sharedV[4 * blockDim.x] = ENERGY_F(0.0);
    sharedV[5 * blockDim.x] = ENERGY_F(0.0);
  }

  // global thread index over a 2D grid of 1D blocks
  int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  // number of atoms handled in this phase (border list for phase 2)
  const int ninteract = comm_phase < 2 ? _inum : _inum_border[0];

  X_FLOAT xtmp, ytmp, ztmp;
  X_FLOAT4 myxtype;
  F_FLOAT fxtmp, fytmp, fztmp, fpair;
  F_FLOAT delx, dely, delz;
  F_FLOAT factor_lj, factor_coul;
  F_FLOAT qtmp;
  int itype, i, j;
  int jnum = 0;  // stays 0 for inactive threads -> neighbor loop is skipped
  int* jlist;

  if(ii < ninteract) {
    i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii] ;

    myxtype = fetchXType(i);
    xtmp = myxtype.x;
    ytmp = myxtype.y;
    ztmp = myxtype.z;
    itype = static_cast <int>(myxtype.w);

    fxtmp = F_F(0.0);
    fytmp = F_F(0.0);
    fztmp = F_F(0.0);

    if(coul_type != COUL_NONE)
      qtmp = fetchQ(i);

    jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);

    // transposed neighbor layout: neighbor jj at jlist[jj * _nlocal]
    // NOTE(review): phase 2 strides _neighbors_border by _nlocal too —
    // confirm the border list uses the same transposed layout
    jlist = comm_phase == 0 ? &_neighbors[i] : (comm_phase == 1 ? &_neighbors_inner[i] : &_neighbors_border[ii]);
  }

  __syncthreads();

  for(int jj = 0; jj < jnum; jj++) {
    if(ii < ninteract)
      if(jj < jnum) {
        fpair = F_F(0.0);
        j = jlist[jj * _nlocal];

        // special-bond encoding: j >= _nall selects a weight slot j/_nall
        factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];

        if(coul_type != COUL_NONE)
          factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];

        j = j < _nall ? j : j % _nall;

        myxtype = fetchXType(j);
        delx = xtmp - myxtype.x;
        dely = ytmp - myxtype.y;
        delz = ztmp - myxtype.z;
        int jtype = static_cast <int>(myxtype.w);

        const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;

        // global cutoff if set (> 0), otherwise the per-type-pair cutoff
        bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);

        if(in_cutoff) {
          // compile-time dispatch: only the selected pair style survives
          switch(pair_type) {
            case PAIR_BORN:
              fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_BUCK:
              fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_CG_CMM:
              fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CHARMM:
              fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CLASS2:
              fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_CUT:
              fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_EXPAND:
              fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_GROMACS:
              fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ_SMOOTH:
              fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_LJ96_CUT:
              fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE_R6:
              fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;

            case PAIR_MORSE:
              fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
              break;
          }
        }

        if(coul_type != COUL_NONE) {
          const F_FLOAT qiqj = qtmp * fetchQ(j);

          // skip pairs with (numerically) zero charge product
          if(qiqj * qiqj > 1e-8) {
            const bool in_coul_cutoff =
              rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);

            if(in_coul_cutoff) {
              switch(coul_type) {
                case COUL_CHARMM:
                  fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CHARMM_IMPLICIT:
                  fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_CUT: {
                  const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);

                  if(eflag) {
                    ecoul += forcecoul;
                  }

                  fpair += forcecoul * (F_F(1.0) / rsq);
                }
                break;

                case COUL_DEBYE: {
                  const F_FLOAT r2inv = F_F(1.0) / rsq;
                  const X_FLOAT r = _RSQRT_(r2inv);
                  const X_FLOAT rinv = F_F(1.0) / r;
                  const F_FLOAT screening = _EXP_(-_kappa * r);
                  F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;

                  if(eflag) {
                    ecoul += forcecoul * rinv;
                  }

                  forcecoul *= (_kappa + rinv);
                  fpair += forcecoul * r2inv;
                }
                break;

                case COUL_GROMACS:
                  fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
                  break;

                case COUL_LONG: {
                  // real-space Ewald term with the A&S erfc approximation
                  const F_FLOAT r2inv = F_F(1.0) / rsq;
                  const F_FLOAT r = _RSQRT_(r2inv);
                  const F_FLOAT grij = _g_ewald * r;
                  const F_FLOAT expm2 = _EXP_(-grij * grij);
                  const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
                  const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
                  const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
                  F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);

                  if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;

                  if(eflag) {
                    ecoul += prefactor * erfc;

                    if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
                  }

                  fpair += forcecoul * r2inv;
                }
                break;

              }
            }

            // force/virial accumulation below applies if either cutoff hit
            in_cutoff = in_cutoff || in_coul_cutoff;
          }
        }

        if(in_cutoff) {
          F_FLOAT dxfp, dyfp, dzfp;
          fxtmp += dxfp = delx * fpair;
          fytmp += dyfp = dely * fpair;
          fztmp += dzfp = delz * fpair;

          if(vflag) {
            sharedV[0 * blockDim.x] += delx * dxfp;
            sharedV[1 * blockDim.x] += dely * dyfp;
            sharedV[2 * blockDim.x] += delz * dzfp;
            sharedV[3 * blockDim.x] += delx * dyfp;
            sharedV[4 * blockDim.x] += delx * dzfp;
            sharedV[5 * blockDim.x] += dely * dzfp;
          }
        }
      }
  }

  __syncthreads();

  if(ii < ninteract) {
    F_FLOAT* my_f;

    if(_collect_forces_later) {
      // forces go into the staging buffer, after the energy/virial regions
      ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;

      if(eflag) {
        buffer = &buffer[1 * gridDim.x * gridDim.y];

        if(coul_type != COUL_NONE)
          buffer = &buffer[1 * gridDim.x * gridDim.y];
      }

      if(vflag) {
        buffer = &buffer[6 * gridDim.x * gridDim.y];
      }

      my_f = (F_FLOAT*) buffer;
      my_f += i;
      *my_f = fxtmp;
      my_f += _nmax;
      *my_f = fytmp;
      my_f += _nmax;
      *my_f = fztmp;
    } else {
      // accumulate directly into the force array (SoA, stride _nmax)
      my_f = _f + i;
      *my_f += fxtmp;
      my_f += _nmax;
      *my_f += fytmp;
      my_f += _nmax;
      *my_f += fztmp;
    }
  }

  __syncthreads();

  if(eflag) {
    sharedE[0] = evdwl;

    if(coul_type != COUL_NONE)
      sharedECoul[0] = ecoul;
  }

  // guard with ii < ninteract: on inactive threads i is uninitialized and
  // must not be used to index _eatom/_vatom
  if(eflag_atom && ii < ninteract && i < _nlocal) {
    if(coul_type != COUL_NONE)
      _eatom[i] += evdwl + ecoul;
    else
      _eatom[i] += evdwl;
  }

  if(vflag_atom && ii < ninteract && i < _nlocal) {
    _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
    _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
    _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
    _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
    _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
    _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
  }

  // block-wide reduction of the shared energy/virial slots (all threads
  // must reach this call; it contains __syncthreads internally)
  if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, coul_type != COUL_NONE ? 1 : 0);
}
992
993 template <const PAIR_FORCES pair_type, const COUL_FORCES coul_type, const unsigned int extended_data>
Pair_Kernel_BpA_opt(int eflag,int vflag,int eflag_atom,int vflag_atom,int comm_phase)994 __global__ void Pair_Kernel_BpA_opt(int eflag, int vflag, int eflag_atom, int vflag_atom, int comm_phase)
995 {
996 int ii = (blockIdx.x * gridDim.y + blockIdx.y);
997
998 if(ii >= (comm_phase < 2 ? _inum : _inum_border[0]))
999 return;
1000
1001 ENERGY_FLOAT evdwl = ENERGY_F(0.0);
1002 ENERGY_FLOAT ecoul = ENERGY_F(0.0);
1003 F_FLOAT3* sharedVirial1;
1004 F_FLOAT3* sharedVirial2;
1005 F_FLOAT* sharedEnergy;
1006 F_FLOAT* sharedEnergyCoul;
1007
1008 F_FLOAT3* sharedForce = (F_FLOAT3*) &sharedmem[0];
1009
1010 if(vflag) {
1011 sharedVirial1 = &sharedForce[64];
1012 sharedVirial2 = &sharedVirial1[64];
1013 } else {
1014 sharedVirial1 = &sharedForce[0];
1015 sharedVirial2 = &sharedVirial1[0];
1016 }
1017
1018 if(eflag) {
1019 if(vflag || vflag_atom)
1020 sharedEnergy = (F_FLOAT*) &sharedVirial2[64];
1021 else
1022 sharedEnergy = (F_FLOAT*) &sharedForce[64];
1023
1024 if(coul_type != COUL_NONE)
1025 sharedEnergyCoul = (F_FLOAT*) &sharedEnergy[64];
1026
1027 }
1028
1029 F_FLOAT3 partialForce = { F_F(0.0), F_F(0.0), F_F(0.0) };
1030 F_FLOAT3 partialVirial1 = { F_F(0.0), F_F(0.0), F_F(0.0) };
1031 F_FLOAT3 partialVirial2 = { F_F(0.0), F_F(0.0), F_F(0.0) };
1032
1033 X_FLOAT xtmp, ytmp, ztmp;
1034 X_FLOAT4 myxtype;
1035 F_FLOAT delx, dely, delz;
1036 F_FLOAT factor_lj, factor_coul;
1037 F_FLOAT fpair;
1038 F_FLOAT qtmp;
1039 int itype, jnum, i, j;
1040 int* jlist;
1041
1042 i = comm_phase < 2 ? _ilist[ii] : _ilist_border[ii];
1043
1044 myxtype = fetchXType(i);
1045
1046 xtmp = myxtype.x;
1047 ytmp = myxtype.y;
1048 ztmp = myxtype.z;
1049 itype = static_cast <int>(myxtype.w);
1050
1051 if(coul_type != COUL_NONE)
1052 qtmp = fetchQ(i);
1053
1054 jnum = comm_phase == 0 ? _numneigh[i] : (comm_phase == 1 ? _numneigh_inner[i] : _numneigh_border[ii]);
1055
1056 jlist = comm_phase == 0 ? &_neighbors[i * _maxneighbors] : (comm_phase == 1 ? &_neighbors_inner[i * _maxneighbors] : &_neighbors_border[ii * _maxneighbors]);
1057 __syncthreads();
1058
1059 for(int jj = threadIdx.x; jj < jnum + blockDim.x; jj += blockDim.x) {
1060 if(jj < jnum) {
1061 fpair = F_F(0.0);
1062 j = jlist[jj];
1063 factor_lj = j < _nall ? F_F(1.0) : _special_lj[j / _nall];
1064
1065 if(coul_type != COUL_NONE)
1066 factor_coul = j < _nall ? F_F(1.0) : _special_coul[j / _nall];
1067
1068 j = j < _nall ? j : j % _nall;
1069
1070 myxtype = fetchXType(j);
1071
1072 delx = xtmp - myxtype.x;
1073 dely = ytmp - myxtype.y;
1074 delz = ztmp - myxtype.z;
1075 int jtype = static_cast <int>(myxtype.w);
1076
1077 const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
1078
1079 bool in_cutoff = rsq < (_cutsq_global > X_F(0.0) ? _cutsq_global : _cutsq[itype * _cuda_ntypes + jtype]);
1080 bool in_coul_cutoff;
1081
1082 if(in_cutoff) {
1083 switch(pair_type) {
1084 case PAIR_BORN:
1085 fpair += PairBornCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1086 break;
1087
1088 case PAIR_BUCK:
1089 fpair += PairBuckCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1090 break;
1091
1092 case PAIR_CG_CMM:
1093 fpair += PairLJSDKCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1094 break;
1095
1096 case PAIR_LJ_CHARMM:
1097 fpair += PairLJCharmmCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1098 break;
1099
1100 case PAIR_LJ_CLASS2:
1101 fpair += PairLJClass2Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1102 break;
1103
1104 case PAIR_LJ_CUT:
1105 fpair += PairLJCutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1106 break;
1107
1108 case PAIR_LJ_EXPAND:
1109 fpair += PairLJExpandCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1110 break;
1111
1112 case PAIR_LJ_GROMACS:
1113 fpair += PairLJGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1114 break;
1115
1116 case PAIR_LJ_SMOOTH:
1117 fpair += PairLJSmoothCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1118 break;
1119
1120 case PAIR_LJ96_CUT:
1121 fpair += PairLJ96CutCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1122 break;
1123
1124 case PAIR_MORSE_R6:
1125 fpair += PairMorseR6Cuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1126 break;
1127
1128 case PAIR_MORSE:
1129 fpair += PairMorseCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_lj, eflag, evdwl);
1130 break;
1131 }
1132 }
1133
1134 if(coul_type != COUL_NONE) {
1135 const F_FLOAT qiqj = qtmp * fetchQ(j);
1136
1137 if(qiqj * qiqj > (1e-8f)) {
1138 in_coul_cutoff =
1139 rsq < (_cut_coulsq_global > X_F(0.0) ? _cut_coulsq_global : _cut_coulsq[itype * _cuda_ntypes + jtype]);
1140
1141 if(in_coul_cutoff) {
1142 switch(coul_type) {
1143 case COUL_CHARMM:
1144 fpair += CoulCharmmCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
1145 break;
1146
1147 case COUL_CHARMM_IMPLICIT:
1148 fpair += CoulCharmmImplicitCuda_Eval(rsq, factor_coul, eflag, ecoul, qiqj);
1149 break;
1150
1151 case COUL_GROMACS:
1152 fpair += CoulGromacsCuda_Eval(rsq, itype * _cuda_ntypes + jtype, factor_coul, eflag, ecoul, qiqj);
1153 break;
1154
1155 case COUL_LONG: {
1156 const F_FLOAT r2inv = F_F(1.0) / rsq;
1157 const F_FLOAT r = _RSQRT_(r2inv);
1158 const F_FLOAT grij = _g_ewald * r;
1159 const F_FLOAT expm2 = _EXP_(-grij * grij);
1160 const F_FLOAT t = F_F(1.0) / (F_F(1.0) + EWALD_P * grij);
1161 const F_FLOAT erfc = t * (A1 + t * (A2 + t * (A3 + t * (A4 + t * A5)))) * expm2;
1162 const F_FLOAT prefactor = _qqrd2e * qiqj * (F_F(1.0) / r);
1163 F_FLOAT forcecoul = prefactor * (erfc + EWALD_F * grij * expm2);
1164
1165 if(factor_coul < 1.0) forcecoul -= (1.0 - factor_coul) * prefactor;
1166
1167 if(eflag) {
1168 ecoul += prefactor * erfc;
1169
1170 if(factor_coul < 1.0) ecoul -= (1.0 - factor_coul) * prefactor;
1171 }
1172
1173 fpair += forcecoul * r2inv;
1174 }
1175 break;
1176
1177 case COUL_DEBYE: {
1178 const F_FLOAT r2inv = F_F(1.0) / rsq;
1179 const X_FLOAT r = _RSQRT_(r2inv);
1180 const X_FLOAT rinv = F_F(1.0) / r;
1181 const F_FLOAT screening = _EXP_(-_kappa * r);
1182 F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * screening ;
1183
1184 if(eflag) {
1185 ecoul += forcecoul * rinv;
1186 }
1187
1188 forcecoul *= (_kappa + rinv);
1189 fpair += forcecoul * r2inv;
1190 }
1191 break;
1192
1193 case COUL_CUT: {
1194 const F_FLOAT forcecoul = factor_coul * _qqrd2e * qiqj * _RSQRT_(rsq);
1195
1196 if(eflag) {
1197 ecoul += forcecoul;
1198 }
1199
1200 fpair += forcecoul * (F_F(1.0) / rsq);
1201 }
1202 break;
1203
1204
1205 }
1206 }
1207 }
1208 }
1209
1210
1211
1212 if(in_cutoff || in_coul_cutoff) {
1213 F_FLOAT dxfp, dyfp, dzfp;
1214 partialForce.x += dxfp = delx * fpair;
1215 partialForce.y += dyfp = dely * fpair;
1216 partialForce.z += dzfp = delz * fpair;
1217
1218 if(vflag) {
1219 partialVirial1.x += delx * dxfp;
1220 partialVirial1.y += dely * dyfp;
1221 partialVirial1.z += delz * dzfp;
1222 partialVirial2.x += delx * dyfp;
1223 partialVirial2.y += delx * dzfp;
1224 partialVirial2.z += dely * dzfp;
1225 }
1226 }
1227 }
1228 }
1229
1230 if(eflag) {
1231 sharedEnergy[threadIdx.x] = evdwl;
1232
1233 if(coul_type != COUL_NONE)
1234 sharedEnergyCoul[threadIdx.x] = ecoul;
1235 }
1236
1237 sharedForce[threadIdx.x] = partialForce;
1238
1239 if(vflag) {
1240 sharedVirial1[threadIdx.x] = partialVirial1;
1241 sharedVirial2[threadIdx.x] = partialVirial2;
1242 }
1243
1244 __syncthreads();
1245
1246
1247 for(unsigned int s = blockDim.x >> 1; s > 0; s >>= 1) {
1248
1249 if(threadIdx.x < s) {
1250 sharedForce[ threadIdx.x ].x += sharedForce[ threadIdx.x + s ].x;
1251 sharedForce[ threadIdx.x ].y += sharedForce[ threadIdx.x + s ].y;
1252 sharedForce[ threadIdx.x ].z += sharedForce[ threadIdx.x + s ].z;
1253
1254 if(vflag) {
1255 sharedVirial1[ threadIdx.x ].x += sharedVirial1[ threadIdx.x + s ].x;
1256 sharedVirial1[ threadIdx.x ].y += sharedVirial1[ threadIdx.x + s ].y;
1257 sharedVirial1[ threadIdx.x ].z += sharedVirial1[ threadIdx.x + s ].z;
1258
1259 sharedVirial2[ threadIdx.x ].x += sharedVirial2[ threadIdx.x + s ].x;
1260 sharedVirial2[ threadIdx.x ].y += sharedVirial2[ threadIdx.x + s ].y;
1261 sharedVirial2[ threadIdx.x ].z += sharedVirial2[ threadIdx.x + s ].z;
1262 }
1263
1264 if(eflag) {
1265 sharedEnergy[ threadIdx.x ] += sharedEnergy[ threadIdx.x + s ];
1266
1267 if(coul_type != COUL_NONE)
1268 sharedEnergyCoul[ threadIdx.x ] += sharedEnergyCoul[ threadIdx.x + s ];
1269 }
1270 }
1271
1272 __syncthreads();
1273 }
1274
1275 if(threadIdx.x == 0) {
1276
1277 ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
1278
1279 if(eflag) {
1280 ENERGY_FLOAT tmp_evdwl;
1281 buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergy[0];
1282
1283 if(eflag_atom)
1284 _eatom[i] = tmp_evdwl;
1285
1286 buffer = &buffer[gridDim.x * gridDim.y];
1287
1288 if(coul_type != COUL_NONE) {
1289 buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp_evdwl = ENERGY_F(0.5) * sharedEnergyCoul[0];
1290
1291 if(eflag_atom)
1292 _eatom[i] += tmp_evdwl;
1293
1294 buffer = &buffer[gridDim.x * gridDim.y];
1295 }
1296 }
1297
1298 if(vflag) {
1299 ENERGY_FLOAT tmp;
1300 buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].x;
1301
1302 if(vflag_atom) _vatom[i + 0 * _nmax] = tmp;
1303
1304 buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].y;
1305
1306 if(vflag_atom) _vatom[i + 1 * _nmax] = tmp;
1307
1308 buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial1[0].z;
1309
1310 if(vflag_atom) _vatom[i + 2 * _nmax] = tmp;
1311
1312 buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].x;
1313
1314 if(vflag_atom) _vatom[i + 3 * _nmax] = tmp;
1315
1316 buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].y;
1317
1318 if(vflag_atom) _vatom[i + 4 * _nmax] = tmp;
1319
1320 buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = tmp = ENERGY_F(0.5) * sharedVirial2[0].z;
1321
1322 if(vflag_atom) _vatom[i + 5 * _nmax] = tmp;
1323
1324 buffer = &buffer[6 * gridDim.x * gridDim.y];
1325 }
1326
1327 F_FLOAT* my_f;
1328
1329 if(_collect_forces_later) {
1330 my_f = (F_FLOAT*) buffer;
1331 my_f += i;
1332 *my_f = sharedForce[0].x;
1333 my_f += _nmax;
1334 *my_f = sharedForce[0].y;
1335 my_f += _nmax;
1336 *my_f = sharedForce[0].z;
1337 } else {
1338 my_f = _f + i;
1339 *my_f += sharedForce[0].x;
1340 my_f += _nmax;
1341 *my_f += sharedForce[0].y;
1342 my_f += _nmax;
1343 *my_f += sharedForce[0].z;
1344 }
1345 }
1346 }
1347
__global__ void Pair_GenerateXType_Kernel()
{
  // Pack position (x,y,z) and atom type of atom i into one X_FLOAT4 so the
  // pair kernels can fetch all four values with a single (texture) access.
  // Launch layout: flattened 2D grid, one thread per atom index.
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nall) return;

  X_FLOAT4 packed;
  packed.x = _x[i];               // positions are stored SoA with stride _nmax
  packed.y = _x[i + _nmax];
  packed.z = _x[i + 2 * _nmax];
  packed.w = _type[i];            // type stored in .w (converted back via cast)
  _x_type[i] = packed;
}
1362
__global__ void Pair_GenerateVRadius_Kernel()
{
  // Pack velocity (vx,vy,vz) and per-atom radius into one V_FLOAT4
  // (used by granular pair styles). One thread per atom, flattened 2D grid.
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nall) return;

  V_FLOAT4 packed;
  packed.x = _v[i];               // velocities stored SoA with stride _nmax
  packed.y = _v[i + _nmax];
  packed.z = _v[i + 2 * _nmax];
  packed.w = _radius[i];
  _v_radius[i] = packed;
}
1376
__global__ void Pair_GenerateOmegaRmass_Kernel()
{
  // Pack angular velocity (wx,wy,wz) and per-atom mass into one V_FLOAT4
  // (used by granular pair styles). One thread per atom, flattened 2D grid.
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nall) return;

  V_FLOAT4 packed;
  packed.x = _omega[i];           // omega stored SoA with stride _nmax
  packed.y = _omega[i + _nmax];
  packed.z = _omega[i + 2 * _nmax];
  packed.w = _rmass[i];
  _omega_rmass[i] = packed;
}
1390
__global__ void Pair_RevertXType_Kernel()
{
  // Inverse of Pair_GenerateXType_Kernel: scatter the packed X_FLOAT4 back
  // into the SoA position array and the integer type array.
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nall) return;

  const X_FLOAT4 packed = _x_type[i];
  _x[i]             = packed.x;
  _x[i + _nmax]     = packed.y;
  _x[i + 2 * _nmax] = packed.z;
  _type[i] = static_cast <int>(packed.w);   // type was stored as floating point in .w
}
1404
__global__ void Pair_BuildXHold_Kernel()
{
  // Snapshot current positions into _xhold (SoA layout, stride _nmax);
  // used to decide later whether atoms moved far enough to rebuild neighbor lists.
  const int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nall) return;

  const X_FLOAT4 packed = _x_type[i];
  _xhold[i]             = packed.x;
  _xhold[i + _nmax]     = packed.y;
  _xhold[i + 2 * _nmax] = packed.z;
}
1417
__global__ void Pair_CollectForces_Kernel(int nperblock, int n)
{
  // Accumulate the per-block partial forces written by Pair_Kernel_TpA
  // (when _collect_forces_later is set) into the global force array _f.
  //
  // Buffer layout assumed: _buffer starts with nperblock * n ENERGY_FLOAT
  // energy/virial reduction slots, immediately followed by the force
  // components stored SoA (fx[0.._nmax), fy, fz). One thread per local atom.
  int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;

  if(i >= _nlocal) return;

  ENERGY_FLOAT* buf = (ENERGY_FLOAT*) _buffer;

  // Skip past the energy/virial region to reach the staged forces.
  F_FLOAT* buf_f = (F_FLOAT*) &buf[nperblock * n];
  F_FLOAT* my_f = _f + i;
  buf_f += i;
  *my_f += *buf_f;        // fx
  my_f += _nmax;
  buf_f += _nmax;
  *my_f += *buf_f;        // fy
  my_f += _nmax;
  buf_f += _nmax;
  *my_f += *buf_f;        // fz
  // (removed a trailing dead "my_f += _nmax;" — the pointer was never used again)
}
1438