1 // **************************************************************************
2 // coul_dsf.cu
3 // -------------------
4 // Trung Dac Nguyen (ORNL)
5 //
6 // Device code for acceleration of the coul/dsf pair style
7 //
8 // __________________________________________________________________________
9 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 // begin : 8/15/2012
13 // email : nguyentd@ornl.gov
14 // ***************************************************************************
15
16 #if defined(NV_KERNEL) || defined(USE_HIP)
17
18 #include "lal_aux_fun1.h"
19 #ifndef _DOUBLE_DOUBLE
20 _texture( pos_tex,float4);
21 _texture( q_tex,float);
22 #else
23 _texture_2d( pos_tex,int4);
24 _texture( q_tex,int2);
25 #endif
26
27 #else
28 #define pos_tex x_
29 #define q_tex q_
30 #endif
31
32 #define MY_PIS (acctyp)1.77245385090551602729
33
k_coul_dsf(const __global numtyp4 * restrict x_,const int lj_types,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp e_shift,const numtyp f_shift,const numtyp alpha,const int t_per_atom)34 __kernel void k_coul_dsf(const __global numtyp4 *restrict x_,
35 const int lj_types,
36 const __global numtyp *restrict sp_lj_in,
37 const __global int *dev_nbor,
38 const __global int *dev_packed,
39 __global acctyp4 *restrict ans,
40 __global acctyp *restrict engv,
41 const int eflag, const int vflag, const int inum,
42 const int nbor_pitch,
43 const __global numtyp *restrict q_ ,
44 const numtyp cut_coulsq, const numtyp qqrd2e,
45 const numtyp e_shift, const numtyp f_shift,
46 const numtyp alpha, const int t_per_atom) {
47 int tid, ii, offset;
48 atom_info(t_per_atom,ii,tid,offset);
49
50 __local numtyp sp_lj[4];
51 int n_stride;
52 local_allocate_store_charge();
53
54 sp_lj[0]=sp_lj_in[0];
55 sp_lj[1]=sp_lj_in[1];
56 sp_lj[2]=sp_lj_in[2];
57 sp_lj[3]=sp_lj_in[3];
58
59 acctyp4 f;
60 f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
61 acctyp energy, e_coul, virial[6];
62 if (EVFLAG) {
63 energy=(acctyp)0;
64 e_coul=(acctyp)0;
65 for (int i=0; i<6; i++) virial[i]=(acctyp)0;
66 }
67
68 if (ii<inum) {
69 int nbor, nbor_end;
70 int i, numj;
71 nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
72 n_stride,nbor_end,nbor);
73
74 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
75 numtyp qtmp; fetch(qtmp,i,q_tex);
76
77 if (EVFLAG && eflag) {
78 acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
79 qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
80 e_coul += (acctyp)2.0*e_self;
81 }
82
83 for ( ; nbor<nbor_end; nbor+=n_stride) {
84 int j=dev_packed[nbor];
85
86 numtyp factor_coul, r, prefactor, erfcc;
87 factor_coul = (numtyp)1.0-sp_lj[sbmask(j)];
88 j &= NEIGHMASK;
89
90 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
91
92 // Compute r12
93 numtyp delx = ix.x-jx.x;
94 numtyp dely = ix.y-jx.y;
95 numtyp delz = ix.z-jx.z;
96 numtyp rsq = delx*delx+dely*dely+delz*delz;
97
98 if (rsq < cut_coulsq) {
99 numtyp r2inv=ucl_recip(rsq);
100 numtyp forcecoul, force;
101
102 r = ucl_sqrt(rsq);
103 fetch(prefactor,j,q_tex);
104 prefactor *= qqrd2e*qtmp/r;
105 numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
106 numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
107 erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
108 forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
109 rsq*f_shift-factor_coul);
110
111 force = forcecoul * r2inv;
112
113 f.x+=delx*force;
114 f.y+=dely*force;
115 f.z+=delz*force;
116
117 if (EVFLAG && eflag) {
118 numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
119 e_coul += e;
120 }
121 if (EVFLAG && vflag) {
122 virial[0] += delx*delx*force;
123 virial[1] += dely*dely*force;
124 virial[2] += delz*delz*force;
125 virial[3] += delx*dely*force;
126 virial[4] += delx*delz*force;
127 virial[5] += dely*delz*force;
128 }
129 }
130
131 } // for nbor
132 } // if ii
133 store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
134 vflag,ans,engv);
135 }
136
k_coul_dsf_fast(const __global numtyp4 * restrict x_,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp e_shift,const numtyp f_shift,const numtyp alpha,const int t_per_atom)137 __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_,
138 const __global numtyp *restrict sp_lj_in,
139 const __global int *dev_nbor,
140 const __global int *dev_packed,
141 __global acctyp4 *restrict ans,
142 __global acctyp *restrict engv,
143 const int eflag, const int vflag, const int inum,
144 const int nbor_pitch,
145 const __global numtyp *restrict q_,
146 const numtyp cut_coulsq, const numtyp qqrd2e,
147 const numtyp e_shift, const numtyp f_shift,
148 const numtyp alpha, const int t_per_atom) {
149 int tid, ii, offset;
150 atom_info(t_per_atom,ii,tid,offset);
151
152 __local numtyp sp_lj[4];
153 int n_stride;
154 local_allocate_store_charge();
155
156 if (tid<4)
157 sp_lj[tid]=sp_lj_in[tid];
158
159 acctyp4 f;
160 f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
161 acctyp energy, e_coul, virial[6];
162 if (EVFLAG) {
163 energy=(acctyp)0;
164 e_coul=(acctyp)0;
165 for (int i=0; i<6; i++) virial[i]=(acctyp)0;
166 }
167
168 __syncthreads();
169
170 if (ii<inum) {
171 int nbor, nbor_end;
172 int i, numj;
173 nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
174 n_stride,nbor_end,nbor);
175
176 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
177 numtyp qtmp; fetch(qtmp,i,q_tex);
178
179 if (EVFLAG && eflag) {
180 acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) *
181 qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
182 e_coul += (acctyp)2.0*e_self;
183 }
184
185 for ( ; nbor<nbor_end; nbor+=n_stride) {
186 int j=dev_packed[nbor];
187
188 numtyp factor_coul, r, prefactor, erfcc;
189 factor_coul = (numtyp)1.0-sp_lj[sbmask(j)];
190 j &= NEIGHMASK;
191
192 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
193
194 // Compute r12
195 numtyp delx = ix.x-jx.x;
196 numtyp dely = ix.y-jx.y;
197 numtyp delz = ix.z-jx.z;
198 numtyp rsq = delx*delx+dely*dely+delz*delz;
199
200 if (rsq < cut_coulsq) {
201 numtyp r2inv=ucl_recip(rsq);
202 numtyp forcecoul, force;
203
204 r = ucl_sqrt(rsq);
205 fetch(prefactor,j,q_tex);
206 prefactor *= qqrd2e*qtmp/r;
207 numtyp erfcd = ucl_exp(-alpha*alpha*rsq);
208 numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*alpha*r);
209 erfcc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * erfcd;
210 forcecoul = prefactor * (erfcc + (numtyp)2.0*alpha/MY_PIS*r*erfcd +
211 rsq*f_shift-factor_coul);
212
213 force = forcecoul * r2inv;
214
215 f.x+=delx*force;
216 f.y+=dely*force;
217 f.z+=delz*force;
218
219 if (EVFLAG && eflag) {
220 numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul);
221 e_coul += e;
222 }
223 if (EVFLAG && vflag) {
224 virial[0] += delx*delx*force;
225 virial[1] += dely*dely*force;
226 virial[2] += delz*delz*force;
227 virial[3] += delx*dely*force;
228 virial[4] += delx*delz*force;
229 virial[5] += dely*delz*force;
230 }
231 }
232
233 } // for nbor
234 } // if ii
235 store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
236 vflag,ans,engv);
237 }
238