1 // **************************************************************************
2 // lj_class2_long.cu
3 // -------------------
4 // W. Michael Brown
5 //
6 // Device code for COMPASS LJ long acceleration
7 //
8 // __________________________________________________________________________
9 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 // begin : Mon May 16 2011
13 // email : brownw@ornl.gov
14 // ***************************************************************************
15
16 #if defined(NV_KERNEL) || defined(USE_HIP)
17
18 #include "lal_aux_fun1.h"
19 #ifndef _DOUBLE_DOUBLE
20 _texture( pos_tex,float4);
21 _texture( q_tex,float);
22 #else
23 _texture_2d( pos_tex,int4);
24 _texture( q_tex,int2);
25 #endif
26
27 #else
28 #define pos_tex x_
29 #define q_tex q_
30 #endif
31
k_lj_class2_long(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1,const __global numtyp4 * restrict lj3,const int lj_types,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp g_ewald,const int t_per_atom)32 __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
33 const __global numtyp4 *restrict lj1,
34 const __global numtyp4 *restrict lj3,
35 const int lj_types,
36 const __global numtyp *restrict sp_lj_in,
37 const __global int *dev_nbor,
38 const __global int *dev_packed,
39 __global acctyp4 *restrict ans,
40 __global acctyp *restrict engv,
41 const int eflag, const int vflag,
42 const int inum, const int nbor_pitch,
43 const __global numtyp *restrict q_,
44 const numtyp cut_coulsq, const numtyp qqrd2e,
45 const numtyp g_ewald, const int t_per_atom) {
46 int tid, ii, offset;
47 atom_info(t_per_atom,ii,tid,offset);
48
49 __local numtyp sp_lj[8];
50 int n_stride;
51 local_allocate_store_charge();
52
53 sp_lj[0]=sp_lj_in[0];
54 sp_lj[1]=sp_lj_in[1];
55 sp_lj[2]=sp_lj_in[2];
56 sp_lj[3]=sp_lj_in[3];
57 sp_lj[4]=sp_lj_in[4];
58 sp_lj[5]=sp_lj_in[5];
59 sp_lj[6]=sp_lj_in[6];
60 sp_lj[7]=sp_lj_in[7];
61
62 acctyp4 f;
63 f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
64 acctyp energy, e_coul, virial[6];
65 if (EVFLAG) {
66 energy=(acctyp)0;
67 e_coul=(acctyp)0;
68 for (int i=0; i<6; i++) virial[i]=(acctyp)0;
69 }
70
71 if (ii<inum) {
72 int nbor, nbor_end;
73 int i, numj;
74 nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
75 n_stride,nbor_end,nbor);
76
77 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
78 numtyp qtmp; fetch(qtmp,i,q_tex);
79 int itype=ix.w;
80
81 for ( ; nbor<nbor_end; nbor+=n_stride) {
82 int j=dev_packed[nbor];
83
84 numtyp factor_lj, factor_coul;
85 factor_lj = sp_lj[sbmask(j)];
86 factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
87 j &= NEIGHMASK;
88
89 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
90 int jtype=jx.w;
91
92 // Compute r12
93 numtyp delx = ix.x-jx.x;
94 numtyp dely = ix.y-jx.y;
95 numtyp delz = ix.z-jx.z;
96 numtyp rsq = delx*delx+dely*dely+delz*delz;
97
98 int mtype=itype*lj_types+jtype;
99 if (rsq<lj1[mtype].z) {
100 numtyp r2inv=ucl_recip(rsq);
101 numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
102
103 if (rsq < lj1[mtype].w) {
104 numtyp rinv=ucl_rsqrt(rsq);
105 r3inv=r2inv*rinv;
106 r6inv = r3inv*r3inv;
107 force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
108 } else
109 force_lj = (numtyp)0.0;
110
111 if (rsq < cut_coulsq) {
112 numtyp r = ucl_rsqrt(r2inv);
113 numtyp grij = g_ewald * r;
114 numtyp expm2 = ucl_exp(-grij*grij);
115 numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
116 _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
117 fetch(prefactor,j,q_tex);
118 prefactor *= qqrd2e * qtmp/r;
119 forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
120 } else
121 forcecoul = (numtyp)0.0;
122
123 force = (force_lj + forcecoul) * r2inv;
124
125 f.x+=delx*force;
126 f.y+=dely*force;
127 f.z+=delz*force;
128
129 if (EVFLAG && eflag) {
130 if (rsq < cut_coulsq)
131 e_coul += prefactor*(_erfc-factor_coul);
132 if (rsq < lj1[mtype].w) {
133 numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
134 energy+=factor_lj*(e-lj3[mtype].z);
135 }
136 }
137 if (EVFLAG && vflag) {
138 virial[0] += delx*delx*force;
139 virial[1] += dely*dely*force;
140 virial[2] += delz*delz*force;
141 virial[3] += delx*dely*force;
142 virial[4] += delx*delz*force;
143 virial[5] += dely*delz*force;
144 }
145 }
146
147 } // for nbor
148 } // if ii
149 store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
150 vflag,ans,engv);
151 }
152
k_lj_class2_long_fast(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1_in,const __global numtyp4 * restrict lj3_in,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp g_ewald,const int t_per_atom)153 __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
154 const __global numtyp4 *restrict lj1_in,
155 const __global numtyp4 *restrict lj3_in,
156 const __global numtyp *restrict sp_lj_in,
157 const __global int *dev_nbor,
158 const __global int *dev_packed,
159 __global acctyp4 *restrict ans,
160 __global acctyp *restrict engv,
161 const int eflag, const int vflag,
162 const int inum, const int nbor_pitch,
163 const __global numtyp *restrict q_,
164 const numtyp cut_coulsq,
165 const numtyp qqrd2e,
166 const numtyp g_ewald,
167 const int t_per_atom) {
168 int tid, ii, offset;
169 atom_info(t_per_atom,ii,tid,offset);
170
171 __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
172 __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
173 __local numtyp sp_lj[8];
174 int n_stride;
175 local_allocate_store_charge();
176
177 if (tid<8)
178 sp_lj[tid]=sp_lj_in[tid];
179 if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
180 lj1[tid]=lj1_in[tid];
181 if (EVFLAG && eflag)
182 lj3[tid]=lj3_in[tid];
183 }
184
185 acctyp4 f;
186 f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
187 acctyp energy, e_coul, virial[6];
188 if (EVFLAG) {
189 energy=(acctyp)0;
190 e_coul=(acctyp)0;
191 for (int i=0; i<6; i++) virial[i]=(acctyp)0;
192 }
193
194 __syncthreads();
195
196 if (ii<inum) {
197 int nbor, nbor_end;
198 int i, numj;
199 nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
200 n_stride,nbor_end,nbor);
201
202 numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
203 numtyp qtmp; fetch(qtmp,i,q_tex);
204 int iw=ix.w;
205 int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
206
207 for ( ; nbor<nbor_end; nbor+=n_stride) {
208 int j=dev_packed[nbor];
209
210 numtyp factor_lj, factor_coul;
211 factor_lj = sp_lj[sbmask(j)];
212 factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
213 j &= NEIGHMASK;
214
215 numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
216 int mtype=itype+jx.w;
217
218 // Compute r12
219 numtyp delx = ix.x-jx.x;
220 numtyp dely = ix.y-jx.y;
221 numtyp delz = ix.z-jx.z;
222 numtyp rsq = delx*delx+dely*dely+delz*delz;
223
224 if (rsq<lj1[mtype].z) {
225 numtyp r2inv=ucl_recip(rsq);
226 numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
227
228 if (rsq < lj1[mtype].w) {
229 numtyp rinv=ucl_rsqrt(rsq);
230 r3inv=r2inv*rinv;
231 r6inv = r3inv*r3inv;
232 force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
233 } else
234 force_lj = (numtyp)0.0;
235
236 if (rsq < cut_coulsq) {
237 numtyp r = ucl_rsqrt(r2inv);
238 numtyp grij = g_ewald * r;
239 numtyp expm2 = ucl_exp(-grij*grij);
240 numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
241 _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
242 fetch(prefactor,j,q_tex);
243 prefactor *= qqrd2e * qtmp/r;
244 forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
245 } else
246 forcecoul = (numtyp)0.0;
247
248 force = (force_lj + forcecoul) * r2inv;
249
250 f.x+=delx*force;
251 f.y+=dely*force;
252 f.z+=delz*force;
253
254 if (EVFLAG && eflag) {
255 if (rsq < cut_coulsq)
256 e_coul += prefactor*(_erfc-factor_coul);
257 if (rsq < lj1[mtype].w) {
258 numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
259 energy+=factor_lj*(e-lj3[mtype].z);
260 }
261 }
262 if (EVFLAG && vflag) {
263 virial[0] += delx*delx*force;
264 virial[1] += dely*dely*force;
265 virial[2] += delz*delz*force;
266 virial[3] += delx*dely*force;
267 virial[4] += delx*delz*force;
268 virial[5] += dely*delz*force;
269 }
270 }
271
272 } // for nbor
273 } // if ii
274 store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
275 vflag,ans,engv);
276 }
277
278