1 // **************************************************************************
2 //                                lj_coul.cu
3 //                             -------------------
4 //                           W. Michael Brown (ORNL)
5 //
6 //  Device code for acceleration of the lj/coul/cut pair style
7 //
8 // __________________________________________________________________________
9 //    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 //    begin                :
13 //    email                : brownw@ornl.gov
14 // ***************************************************************************
15 
16 #if defined(NV_KERNEL) || defined(USE_HIP)
17 
18 #include "lal_aux_fun1.h"
19 #ifndef _DOUBLE_DOUBLE
20 _texture( pos_tex,float4);
21 _texture( q_tex,float);
22 #else
23 _texture_2d( pos_tex,int4);
24 _texture( q_tex,int2);
25 #endif
26 
27 #else
28 #define pos_tex x_
29 #define q_tex q_
30 #endif
31 
k_lj_coul(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1,const __global numtyp4 * restrict lj3,const int lj_types,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const __global numtyp * restrict cutsq,const numtyp qqrd2e,const int t_per_atom)32 __kernel void k_lj_coul(const __global numtyp4 *restrict x_,
33                         const __global numtyp4 *restrict lj1,
34                         const __global numtyp4 *restrict  lj3,
35                         const int lj_types,
36                         const __global numtyp *restrict sp_lj_in,
37                         const __global int *dev_nbor,
38                         const __global int *dev_packed,
39                         __global acctyp4 *restrict ans,
40                         __global acctyp *restrict engv,
41                         const int eflag, const int vflag, const int inum,
42                         const int nbor_pitch,
43                         const __global numtyp *restrict q_,
44                         const __global numtyp *restrict cutsq,
45                         const numtyp qqrd2e, const int t_per_atom) {
46   int tid, ii, offset;
47   atom_info(t_per_atom,ii,tid,offset);
48 
49   __local numtyp sp_lj[8];
50   int n_stride;
51   local_allocate_store_charge();
52 
53   sp_lj[0]=sp_lj_in[0];
54   sp_lj[1]=sp_lj_in[1];
55   sp_lj[2]=sp_lj_in[2];
56   sp_lj[3]=sp_lj_in[3];
57   sp_lj[4]=sp_lj_in[4];
58   sp_lj[5]=sp_lj_in[5];
59   sp_lj[6]=sp_lj_in[6];
60   sp_lj[7]=sp_lj_in[7];
61 
62   acctyp4 f;
63   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
64   acctyp energy, e_coul, virial[6];
65   if (EVFLAG) {
66     energy=(acctyp)0;
67     e_coul=(acctyp)0;
68     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
69   }
70 
71   if (ii<inum) {
72     int nbor, nbor_end;
73     int i, numj;
74     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
75               n_stride,nbor_end,nbor);
76 
77     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
78     numtyp qtmp; fetch(qtmp,i,q_tex);
79     int itype=ix.w;
80 
81     for ( ; nbor<nbor_end; nbor+=n_stride) {
82       int j=dev_packed[nbor];
83 
84       numtyp factor_lj, factor_coul;
85       factor_lj = sp_lj[sbmask(j)];
86       factor_coul = sp_lj[sbmask(j)+4];
87       j &= NEIGHMASK;
88 
89       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
90       int jtype=jx.w;
91 
92       // Compute r12
93       numtyp delx = ix.x-jx.x;
94       numtyp dely = ix.y-jx.y;
95       numtyp delz = ix.z-jx.z;
96       numtyp rsq = delx*delx+dely*dely+delz*delz;
97 
98       int mtype=itype*lj_types+jtype;
99       if (rsq<cutsq[mtype]) {
100         numtyp r2inv=ucl_recip(rsq);
101         numtyp forcecoul, force_lj, force, r6inv;
102 
103         if (rsq < lj1[mtype].z) {
104           r6inv = r2inv*r2inv*r2inv;
105           force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
106         } else
107           force_lj = (numtyp)0.0;
108 
109         if (rsq < lj1[mtype].w) {
110           fetch(forcecoul,j,q_tex);
111           forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
112         } else
113           forcecoul = (numtyp)0.0;
114 
115         force = (force_lj + forcecoul) * r2inv;
116 
117         f.x+=delx*force;
118         f.y+=dely*force;
119         f.z+=delz*force;
120 
121         if (EVFLAG && eflag) {
122           e_coul += forcecoul;
123           if (rsq < lj1[mtype].z) {
124             numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
125             energy+=factor_lj*(e-lj3[mtype].z);
126           }
127         }
128         if (EVFLAG && vflag) {
129           virial[0] += delx*delx*force;
130           virial[1] += dely*dely*force;
131           virial[2] += delz*delz*force;
132           virial[3] += delx*dely*force;
133           virial[4] += delx*delz*force;
134           virial[5] += dely*delz*force;
135         }
136       }
137 
138     } // for nbor
139   } // if ii
140   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
141                   vflag,ans,engv);
142 }
143 
k_lj_coul_fast(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1_in,const __global numtyp4 * restrict lj3_in,const __global numtyp * restrict sp_lj_in,const __global int * dev_nbor,const __global int * dev_packed,__global acctyp4 * restrict ans,__global acctyp * restrict engv,const int eflag,const int vflag,const int inum,const int nbor_pitch,const __global numtyp * restrict q_,const __global numtyp * restrict _cutsq,const numtyp qqrd2e,const int t_per_atom)144 __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_,
145                              const __global numtyp4 *restrict lj1_in,
146                              const __global numtyp4 *restrict lj3_in,
147                              const __global numtyp *restrict sp_lj_in,
148                              const __global int *dev_nbor,
149                              const __global int *dev_packed,
150                              __global acctyp4 *restrict ans,
151                              __global acctyp *restrict engv,
152                              const int eflag, const int vflag, const int inum,
153                              const int nbor_pitch,
154                              const __global numtyp *restrict q_,
155                              const __global numtyp *restrict _cutsq,
156                              const numtyp qqrd2e, const int t_per_atom) {
157   int tid, ii, offset;
158   atom_info(t_per_atom,ii,tid,offset);
159 
160   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
161   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
162   __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
163   __local numtyp sp_lj[8];
164   int n_stride;
165   local_allocate_store_charge();
166 
167   if (tid<8)
168     sp_lj[tid]=sp_lj_in[tid];
169   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
170     lj1[tid]=lj1_in[tid];
171     cutsq[tid]=_cutsq[tid];
172     if (EVFLAG && eflag)
173       lj3[tid]=lj3_in[tid];
174   }
175 
176   acctyp4 f;
177   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
178   acctyp energy, e_coul, virial[6];
179   if (EVFLAG) {
180     energy=(acctyp)0;
181     e_coul=(acctyp)0;
182     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
183   }
184 
185   __syncthreads();
186 
187   if (ii<inum) {
188     int nbor, nbor_end;
189     int i, numj;
190     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
191               n_stride,nbor_end,nbor);
192 
193     numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
194     numtyp qtmp; fetch(qtmp,i,q_tex);
195     int iw=ix.w;
196     int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
197 
198     for ( ; nbor<nbor_end; nbor+=n_stride) {
199       int j=dev_packed[nbor];
200 
201       numtyp factor_lj, factor_coul;
202       factor_lj = sp_lj[sbmask(j)];
203       factor_coul = sp_lj[sbmask(j)+4];
204       j &= NEIGHMASK;
205 
206       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
207       int mtype=itype+jx.w;
208 
209       // Compute r12
210       numtyp delx = ix.x-jx.x;
211       numtyp dely = ix.y-jx.y;
212       numtyp delz = ix.z-jx.z;
213       numtyp rsq = delx*delx+dely*dely+delz*delz;
214 
215       if (rsq<cutsq[mtype]) {
216         numtyp r2inv=ucl_recip(rsq);
217         numtyp forcecoul, force_lj, force, r6inv;
218 
219         if (rsq < lj1[mtype].z) {
220           r6inv = r2inv*r2inv*r2inv;
221           force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
222         } else
223           force_lj = (numtyp)0.0;
224 
225         if (rsq < lj1[mtype].w) {
226           fetch(forcecoul,j,q_tex);
227           forcecoul *= qqrd2e*qtmp*ucl_rsqrt(rsq)*factor_coul;
228         } else
229           forcecoul = (numtyp)0.0;
230 
231         force = (force_lj + forcecoul) * r2inv;
232 
233         f.x+=delx*force;
234         f.y+=dely*force;
235         f.z+=delz*force;
236 
237         if (EVFLAG && eflag) {
238           e_coul += forcecoul;
239           if (rsq < lj1[mtype].z) {
240             numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
241             energy+=factor_lj*(e-lj3[mtype].z);
242           }
243         }
244         if (EVFLAG && vflag) {
245           virial[0] += delx*delx*force;
246           virial[1] += dely*dely*force;
247           virial[2] += delz*delz*force;
248           virial[3] += delx*dely*force;
249           virial[4] += delx*delz*force;
250           virial[5] += dely*delz*force;
251         }
252       }
253 
254     } // for nbor
255   } // if ii
256   store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
257                   vflag,ans,engv);
258 }
259 
260