1 // **************************************************************************
2 //                              lj_class2_long.cu
3 //                             -------------------
4 //                               W. Michael Brown
5 //
6 //  Device code for COMPASS LJ long acceleration
7 //
8 // __________________________________________________________________________
9 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 //    begin                : Mon May 16 2011
13 //    email                : brownw@ornl.gov
14 // ***************************************************************************
15 
16 #if defined(NV_KERNEL) || defined(USE_HIP)
17 
18 #include "lal_aux_fun1.h"
19 #ifndef _DOUBLE_DOUBLE
20 _texture( pos_tex,float4);
21 _texture( q_tex,float);
22 #else
23 _texture_2d( pos_tex,int4);
24 _texture( q_tex,int2);
25 #endif
26 
27 #else
28 #define pos_tex x_
29 #define q_tex q_
30 #endif
31 
k_lj_class2_long(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1,const __global numtyp4 * restrict lj3,const int lj_types,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp g_ewald,const int t_per_atom)32 __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_,
33                                const __global numtyp4 *restrict lj1,
34                                const __global numtyp4 *restrict lj3,
35                                const int lj_types,
36                                const __global numtyp *restrict sp_lj_in,
37                                const __global int *dev_nbor,
38                                const __global int *dev_packed,
39                                __global acctyp4 *restrict ans,
40                                __global acctyp *restrict engv,
41                                const int eflag,  const int vflag,
42                                const int inum, const int nbor_pitch,
43                                const __global numtyp *restrict q_,
44                                const numtyp cut_coulsq, const numtyp qqrd2e,
45                                const numtyp g_ewald, const int t_per_atom) {
46   int tid, ii, offset;
47   atom_info(t_per_atom,ii,tid,offset);
48 
49   __local numtyp sp_lj[8];
50   int n_stride;
51   local_allocate_store_charge();
52 
53   sp_lj[0]=sp_lj_in[0];
54   sp_lj[1]=sp_lj_in[1];
55   sp_lj[2]=sp_lj_in[2];
56   sp_lj[3]=sp_lj_in[3];
57   sp_lj[4]=sp_lj_in[4];
58   sp_lj[5]=sp_lj_in[5];
59   sp_lj[6]=sp_lj_in[6];
60   sp_lj[7]=sp_lj_in[7];
61 
62   acctyp4 f;
63   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
64   acctyp energy, e_coul, virial[6];
65   if (EVFLAG) {
66     energy=(acctyp)0;
67     e_coul=(acctyp)0;
68     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
69   }
70 
71   if (ii<inum) {
72     int nbor, nbor_end;
73     int i, numj;
74     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
75               n_stride,nbor_end,nbor);
76 
77     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
78     numtyp qtmp; fetch(qtmp,i,q_tex);
79     int itype=ix.w;
80 
81     for ( ; nbor<nbor_end; nbor+=n_stride) {
82       int j=dev_packed[nbor];
83 
84       numtyp factor_lj, factor_coul;
85       factor_lj = sp_lj[sbmask(j)];
86       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
87       j &= NEIGHMASK;
88 
89       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
90       int jtype=jx.w;
91 
92       // Compute r12
93       numtyp delx = ix.x-jx.x;
94       numtyp dely = ix.y-jx.y;
95       numtyp delz = ix.z-jx.z;
96       numtyp rsq = delx*delx+dely*dely+delz*delz;
97 
98       int mtype=itype*lj_types+jtype;
99       if (rsq<lj1[mtype].z) {
100         numtyp r2inv=ucl_recip(rsq);
101         numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
102 
103         if (rsq < lj1[mtype].w) {
104           numtyp rinv=ucl_rsqrt(rsq);
105           r3inv=r2inv*rinv;
106           r6inv = r3inv*r3inv;
107           force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
108         } else
109           force_lj = (numtyp)0.0;
110 
111         if (rsq < cut_coulsq) {
112           numtyp r = ucl_rsqrt(r2inv);
113           numtyp grij = g_ewald * r;
114           numtyp expm2 = ucl_exp(-grij*grij);
115           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
116           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
117           fetch(prefactor,j,q_tex);
118           prefactor *= qqrd2e * qtmp/r;
119           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
120         } else
121           forcecoul = (numtyp)0.0;
122 
123         force = (force_lj + forcecoul) * r2inv;
124 
125         f.x+=delx*force;
126         f.y+=dely*force;
127         f.z+=delz*force;
128 
129         if (EVFLAG && eflag) {
130           if (rsq < cut_coulsq)
131             e_coul += prefactor*(_erfc-factor_coul);
132           if (rsq < lj1[mtype].w) {
133             numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
134             energy+=factor_lj*(e-lj3[mtype].z);
135           }
136         }
137         if (EVFLAG && vflag) {
138           virial[0] += delx*delx*force;
139           virial[1] += dely*dely*force;
140           virial[2] += delz*delz*force;
141           virial[3] += delx*dely*force;
142           virial[4] += delx*delz*force;
143           virial[5] += dely*delz*force;
144         }
145       }
146 
147     } // for nbor
148   } // if ii
149   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
150                   vflag,ans,engv);
151 }
152 
k_lj_class2_long_fast(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1_in,const __global numtyp4 * restrict lj3_in,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const numtyp cut_coulsq,const numtyp qqrd2e,const numtyp g_ewald,const int t_per_atom)153 __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_,
154                                     const __global numtyp4 *restrict lj1_in,
155                                     const __global numtyp4 *restrict lj3_in,
156                                     const __global numtyp *restrict sp_lj_in,
157                                     const __global int *dev_nbor,
158                                     const __global int *dev_packed,
159                                     __global acctyp4 *restrict ans,
160                                     __global acctyp *restrict engv,
161                                     const int eflag, const int vflag,
162                                     const int inum, const int nbor_pitch,
163                                     const __global numtyp *restrict q_,
164                                     const numtyp cut_coulsq,
165                                     const numtyp qqrd2e,
166                                     const numtyp g_ewald,
167                                     const int t_per_atom) {
168   int tid, ii, offset;
169   atom_info(t_per_atom,ii,tid,offset);
170 
171   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
172   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
173   __local numtyp sp_lj[8];
174   int n_stride;
175   local_allocate_store_charge();
176 
177   if (tid<8)
178     sp_lj[tid]=sp_lj_in[tid];
179   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
180     lj1[tid]=lj1_in[tid];
181     if (EVFLAG && eflag)
182       lj3[tid]=lj3_in[tid];
183   }
184 
185   acctyp4 f;
186   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
187   acctyp energy, e_coul, virial[6];
188   if (EVFLAG) {
189     energy=(acctyp)0;
190     e_coul=(acctyp)0;
191     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
192   }
193 
194   __syncthreads();
195 
196   if (ii<inum) {
197     int nbor, nbor_end;
198     int i, numj;
199     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
200               n_stride,nbor_end,nbor);
201 
202     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
203     numtyp qtmp; fetch(qtmp,i,q_tex);
204     int iw=ix.w;
205     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
206 
207     for ( ; nbor<nbor_end; nbor+=n_stride) {
208       int j=dev_packed[nbor];
209 
210       numtyp factor_lj, factor_coul;
211       factor_lj = sp_lj[sbmask(j)];
212       factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];
213       j &= NEIGHMASK;
214 
215       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
216       int mtype=itype+jx.w;
217 
218       // Compute r12
219       numtyp delx = ix.x-jx.x;
220       numtyp dely = ix.y-jx.y;
221       numtyp delz = ix.z-jx.z;
222       numtyp rsq = delx*delx+dely*dely+delz*delz;
223 
224       if (rsq<lj1[mtype].z) {
225         numtyp r2inv=ucl_recip(rsq);
226         numtyp forcecoul, force_lj, force, r6inv, r3inv, prefactor, _erfc;
227 
228         if (rsq < lj1[mtype].w) {
229           numtyp rinv=ucl_rsqrt(rsq);
230           r3inv=r2inv*rinv;
231           r6inv = r3inv*r3inv;
232           force_lj = factor_lj*r6inv*(lj1[mtype].x*r3inv-lj1[mtype].y);
233         } else
234           force_lj = (numtyp)0.0;
235 
236         if (rsq < cut_coulsq) {
237           numtyp r = ucl_rsqrt(r2inv);
238           numtyp grij = g_ewald * r;
239           numtyp expm2 = ucl_exp(-grij*grij);
240           numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
241           _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
242           fetch(prefactor,j,q_tex);
243           prefactor *= qqrd2e * qtmp/r;
244           forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
245         } else
246           forcecoul = (numtyp)0.0;
247 
248         force = (force_lj + forcecoul) * r2inv;
249 
250         f.x+=delx*force;
251         f.y+=dely*force;
252         f.z+=delz*force;
253 
254         if (EVFLAG && eflag) {
255           if (rsq < cut_coulsq)
256             e_coul += prefactor*(_erfc-factor_coul);
257           if (rsq < lj1[mtype].w) {
258             numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y);
259             energy+=factor_lj*(e-lj3[mtype].z);
260           }
261         }
262         if (EVFLAG && vflag) {
263           virial[0] += delx*delx*force;
264           virial[1] += dely*dely*force;
265           virial[2] += delz*delz*force;
266           virial[3] += delx*dely*force;
267           virial[4] += delx*delz*force;
268           virial[5] += dely*delz*force;
269         }
270       }
271 
272     } // for nbor
273   } // if ii
274   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
275                   vflag,ans,engv);
276 }
277 
278