1 // **************************************************************************
2 //                                coul_dsf.cu
3 //                             -------------------
4 //                           Trung Dac Nguyen (ORNL)
5 //
6 //  Device code for acceleration of the coul/dsf pair style
7 //
8 // __________________________________________________________________________
9 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 //    begin                : 8/15/2012
13 //    email                : nguyentd@ornl.gov
14 // ***************************************************************************
15 
16 #if defined(NV_KERNEL) || defined(USE_HIP)
17 
18 #include "lal_aux_fun1.h"
19 #ifndef _DOUBLE_DOUBLE
20 _texture( pos_tex,float4);
21 _texture( q_tex,float);
22 #else
23 _texture_2d( pos_tex,int4);
24 _texture( q_tex,int2);
25 #endif
26 
27 #else
28 #define pos_tex x_
29 #define q_tex q_
30 #endif
31 
32 #define MY_PIS (acctyp)1.77245385090551602729
33 
k_coul_dsf(const __global numtyp4 * restrict x_,const int lj_types,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp e_shift,const numtyp f_shift,const numtyp alpha,const int t_per_atom)34 __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
35                          const int lj_types,
36                          const __global numtyp *restrict sp_lj_in,
37                          const __global int *dev_nbor,
38                          const __global int *dev_packed,
39                          __global acctyp4 *restrict ans,
40                          __global acctyp *restrict engv,
41                          const int eflag, const int vflag, const int inum,
42                          const int nbor_pitch,
43                          const __global numtyp *restrict q_ ,
44                          const numtyp cut_coulsq, const numtyp qqrd2e,
45                          const numtyp e_shift, const numtyp f_shift,
46                          const numtyp alpha, const int t_per_atom) {
47   int tid, ii, offset;
48   atom_info(t_per_atom,ii,tid,offset);
49 
50   __local numtyp sp_lj[4];
51   int n_stride;
52   local_allocate_store_charge();
53 
54   sp_lj[0]=sp_lj_in[0];
55   sp_lj[1]=sp_lj_in[1];
56   sp_lj[2]=sp_lj_in[2];
57   sp_lj[3]=sp_lj_in[3];
58 
59   acctyp4 f;
60   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
61   acctyp energy, e_coul, virial[6];
62   if (EVFLAG) {
63     energy=(acctyp)0;
64     e_coul=(acctyp)0;
65     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
66   }
67 
68   if (ii<inum) {
69     int nbor, nbor_end;
70     int i, numj;
71     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
72               n_stride,nbor_end,nbor);
73 
74     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
75     numtyp qtmp; fetch(qtmp,i,q_tex);
76 
77     if (EVFLAG && eflag) {
78       acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
79         qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
80       e_coul += (acctyp)2.0*e_self;
81     }
82 
83     for ( ; nbor<nbor_end; nbor+=n_stride) {
84       int j=dev_packed[nbor];
85 
86       numtyp factor_coul, r, prefactor, erfcc;
87       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)];
88       j &= NEIGHMASK;
89 
90       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
91 
92       // Compute r12
93       numtyp delx = ix.x-jx.x;
94       numtyp dely = ix.y-jx.y;
95       numtyp delz = ix.z-jx.z;
96       numtyp rsq = delx*delx+dely*dely+delz*delz;
97 
98       if (rsq < cut_coulsq) {
99         numtyp r2inv=ucl_recip(rsq);
100         numtyp forcecoul, force;
101 
102         r = ucl_sqrt(rsq);
103         fetch(prefactor,j,q_tex);
104         prefactor *= qqrd2e*qtmp/r;
105         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
106         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
107         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
108         forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
109           rsq*f_shift-factor_coul);
110 
111         force = forcecoul * r2inv;
112 
113         f.x+=delx*force;
114         f.y+=dely*force;
115         f.z+=delz*force;
116 
117         if (EVFLAG && eflag) {
118           numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
119           e_coul += e;
120         }
121         if (EVFLAG && vflag) {
122           virial[0] += delx*delx*force;
123           virial[1] += dely*dely*force;
124           virial[2] += delz*delz*force;
125           virial[3] += delx*dely*force;
126           virial[4] += delx*delz*force;
127           virial[5] += dely*delz*force;
128         }
129       }
130 
131     } // for nbor
132   } // if ii
133   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
134                   vflag,ans,engv);
135 }
136 
k_coul_dsf_fast(const __global numtyp4 * restrict x_,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp e_shift,const numtyp f_shift,const numtyp alpha,const int t_per_atom)137 __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
138                               const __global numtyp *restrict sp_lj_in,
139                               const __global int *dev_nbor,
140                               const __global int *dev_packed,
141                               __global acctyp4 *restrict ans,
142                               __global acctyp *restrict engv,
143                               const int eflag, const int vflag, const int inum,
144                               const int nbor_pitch,
145                               const __global numtyp *restrict q_,
146                               const numtyp cut_coulsq, const numtyp qqrd2e,
147                               const numtyp e_shift, const numtyp f_shift,
148                               const numtyp alpha, const int t_per_atom) {
149   int tid, ii, offset;
150   atom_info(t_per_atom,ii,tid,offset);
151 
152   __local numtyp sp_lj[4];
153   int n_stride;
154   local_allocate_store_charge();
155 
156   if (tid<4)
157     sp_lj[tid]=sp_lj_in[tid];
158 
159   acctyp4 f;
160   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
161   acctyp energy, e_coul, virial[6];
162   if (EVFLAG) {
163     energy=(acctyp)0;
164     e_coul=(acctyp)0;
165     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
166   }
167 
168   __syncthreads();
169 
170   if (ii<inum) {
171     int nbor, nbor_end;
172     int i, numj;
173     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
174               n_stride,nbor_end,nbor);
175 
176     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
177     numtyp qtmp; fetch(qtmp,i,q_tex);
178 
179     if (EVFLAG && eflag) {
180       acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
181         qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
182       e_coul += (acctyp)2.0*e_self;
183     }
184 
185     for ( ; nbor<nbor_end; nbor+=n_stride) {
186       int j=dev_packed[nbor];
187 
188       numtyp factor_coul, r, prefactor, erfcc;
189       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)];
190       j &= NEIGHMASK;
191 
192       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
193 
194       // Compute r12
195       numtyp delx = ix.x-jx.x;
196       numtyp dely = ix.y-jx.y;
197       numtyp delz = ix.z-jx.z;
198       numtyp rsq = delx*delx+dely*dely+delz*delz;
199 
200       if (rsq < cut_coulsq) {
201         numtyp r2inv=ucl_recip(rsq);
202         numtyp forcecoul, force;
203 
204         r = ucl_sqrt(rsq);
205         fetch(prefactor,j,q_tex);
206         prefactor *= qqrd2e*qtmp/r;
207         numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
208         numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
209         erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
210         forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
211           rsq*f_shift-factor_coul);
212 
213         force = forcecoul * r2inv;
214 
215         f.x+=delx*force;
216         f.y+=dely*force;
217         f.z+=delz*force;
218 
219         if (EVFLAG && eflag) {
220           numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
221           e_coul += e;
222         }
223         if (EVFLAG && vflag) {
224           virial[0] += delx*delx*force;
225           virial[1] += dely*dely*force;
226           virial[2] += delz*delz*force;
227           virial[3] += delx*dely*force;
228           virial[4] += delx*delz*force;
229           virial[5] += dely*delz*force;
230         }
231       }
232 
233     } // for nbor
234   } // if ii
235   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
236                   vflag,ans,engv);
237 }
238