1 // **************************************************************************
2 // gayberne_lj.cu
3 // -------------------
4 // W. Michael Brown (ORNL)
5 //
6 // Device code for Gay-Berne - Lennard-Jones potential acceleration
7 //
8 // __________________________________________________________________________
9 // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
10 // __________________________________________________________________________
11 //
12 // begin :
13 // email : brownw@ornl.gov
14 // ***************************************************************************/
15
16 #ifdef NV_KERNEL
17 #include "lal_ellipsoid_extra.h"
18 #endif
19
k_gayberne_sphere_ellipsoid(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict q,const __global numtyp4 * restrict shape,const __global numtyp4 * restrict well,const __global numtyp * restrict gum,const __global numtyp2 * restrict sig_eps,const int ntypes,const __global numtyp * restrict lshape,const __global int * dev_nbor,const int stride,__global acctyp4 * restrict ans,__global acctyp * restrict engv,__global int * restrict err_flag,const int eflag,const int vflag,const int start,const int inum,const int t_per_atom)20 __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_,
21 const __global numtyp4 *restrict q,
22 const __global numtyp4 *restrict shape,
23 const __global numtyp4 *restrict well,
24 const __global numtyp *restrict gum,
25 const __global numtyp2 *restrict sig_eps,
26 const int ntypes,
27 const __global numtyp *restrict lshape,
28 const __global int *dev_nbor,
29 const int stride,
30 __global acctyp4 *restrict ans,
31 __global acctyp *restrict engv,
32 __global int *restrict err_flag,
33 const int eflag, const int vflag,
34 const int start, const int inum,
35 const int t_per_atom) {
36 int tid, ii, offset;
37 atom_info(t_per_atom,ii,tid,offset);
38 ii+=start;
39
40 __local numtyp sp_lj[4];
41 sp_lj[0]=gum[3];
42 sp_lj[1]=gum[4];
43 sp_lj[2]=gum[5];
44 sp_lj[3]=gum[6];
45
46 acctyp energy=(acctyp)0;
47 acctyp4 f;
48 f.x=(acctyp)0;
49 f.y=(acctyp)0;
50 f.z=(acctyp)0;
51 acctyp virial[6];
52 for (int i=0; i<6; i++)
53 virial[i]=(acctyp)0;
54
55 if (ii<inum) {
56 const __global int *nbor, *nbor_end;
57 int i, numj;
58 __local int n_stride;
59 nbor_info_e(dev_nbor,stride,t_per_atom,ii,offset,i,numj,
60 n_stride,nbor_end,nbor);
61
62 numtyp4 ix; fetch4(ix,i,pos_tex);
63 int itype=ix.w;
64
65 numtyp oner=shape[itype].x;
66 numtyp one_well=well[itype].x;
67
68 numtyp factor_lj;
69 for ( ; nbor<nbor_end; nbor+=n_stride) {
70
71 int j=*nbor;
72 factor_lj = sp_lj[sbmask(j)];
73 j &= NEIGHMASK;
74
75 numtyp4 jx; fetch4(jx,j,pos_tex);
76 int jtype=jx.w;
77
78 // Compute r12
79 numtyp r12[3];
80 r12[0] = jx.x-ix.x;
81 r12[1] = jx.y-ix.y;
82 r12[2] = jx.z-ix.z;
83 numtyp ir = gpu_dot3(r12,r12);
84
85 ir = ucl_rsqrt(ir);
86 numtyp r = ucl_recip(ir);
87
88 numtyp r12hat[3];
89 r12hat[0]=r12[0]*ir;
90 r12hat[1]=r12[1]*ir;
91 r12hat[2]=r12[2]*ir;
92
93 numtyp a2[9];
94 gpu_quat_to_mat_trans(q,j,a2);
95
96 numtyp u_r, dUr[3], eta;
97 { // Compute U_r, dUr, eta, and teta
98 // Compute g12
99 numtyp g12[9];
100 {
101 {
102 numtyp g2[9];
103 gpu_diag_times3(shape[jtype],a2,g12);
104 gpu_transpose_times3(a2,g12,g2);
105 g12[0]=g2[0]+oner;
106 g12[4]=g2[4]+oner;
107 g12[8]=g2[8]+oner;
108 g12[1]=g2[1];
109 g12[2]=g2[2];
110 g12[3]=g2[3];
111 g12[5]=g2[5];
112 g12[6]=g2[6];
113 g12[7]=g2[7];
114 }
115
116 { // Compute U_r and dUr
117
118 // Compute kappa
119 numtyp kappa[3];
120 gpu_mldivide3(g12,r12,kappa,err_flag);
121
122 // -- kappa is now / r
123 kappa[0]*=ir;
124 kappa[1]*=ir;
125 kappa[2]*=ir;
126
127 // energy
128
129 // compute u_r and dUr
130 numtyp uslj_rsq;
131 {
132 // Compute distance of closest approach
133 numtyp h12, sigma12;
134 sigma12 = gpu_dot3(r12hat,kappa);
135 sigma12 = ucl_rsqrt((numtyp)0.5*sigma12);
136 h12 = r-sigma12;
137
138 // -- kappa is now ok
139 kappa[0]*=r;
140 kappa[1]*=r;
141 kappa[2]*=r;
142
143 int mtype=fast_mul(ntypes,itype)+jtype;
144 numtyp sigma = sig_eps[mtype].x;
145 numtyp epsilon = sig_eps[mtype].y;
146 numtyp varrho = sigma/(h12+gum[0]*sigma);
147 numtyp varrho6 = varrho*varrho*varrho;
148 varrho6*=varrho6;
149 numtyp varrho12 = varrho6*varrho6;
150 u_r = (numtyp)4.0*epsilon*(varrho12-varrho6);
151
152 numtyp temp1 = ((numtyp)2.0*varrho12*varrho-varrho6*varrho)/sigma;
153 temp1 = temp1*(numtyp)24.0*epsilon;
154 uslj_rsq = temp1*sigma12*sigma12*sigma12*(numtyp)0.5;
155 numtyp temp2 = gpu_dot3(kappa,r12hat);
156 uslj_rsq = uslj_rsq*ir*ir;
157
158 dUr[0] = temp1*r12hat[0]+uslj_rsq*(kappa[0]-temp2*r12hat[0]);
159 dUr[1] = temp1*r12hat[1]+uslj_rsq*(kappa[1]-temp2*r12hat[1]);
160 dUr[2] = temp1*r12hat[2]+uslj_rsq*(kappa[2]-temp2*r12hat[2]);
161 }
162 }
163 }
164
165 // Compute eta
166 {
167 eta = (numtyp)2.0*lshape[itype]*lshape[jtype];
168 numtyp det_g12 = gpu_det3(g12);
169 eta = ucl_powr(eta/det_g12,gum[1]);
170 }
171 }
172
173 numtyp chi, dchi[3];
174 { // Compute chi and dchi
175
176 // Compute b12
177 numtyp b12[9];
178 {
179 numtyp b2[9];
180 gpu_diag_times3(well[jtype],a2,b12);
181 gpu_transpose_times3(a2,b12,b2);
182 b12[0]=b2[0]+one_well;
183 b12[4]=b2[4]+one_well;
184 b12[8]=b2[8]+one_well;
185 b12[1]=b2[1];
186 b12[2]=b2[2];
187 b12[3]=b2[3];
188 b12[5]=b2[5];
189 b12[6]=b2[6];
190 b12[7]=b2[7];
191 }
192
193 // compute chi_12
194 numtyp iota[3];
195 gpu_mldivide3(b12,r12,iota,err_flag);
196 // -- iota is now iota/r
197 iota[0]*=ir;
198 iota[1]*=ir;
199 iota[2]*=ir;
200 chi = gpu_dot3(r12hat,iota);
201 chi = ucl_powr(chi*(numtyp)2.0,gum[2]);
202
203 // -- iota is now ok
204 iota[0]*=r;
205 iota[1]*=r;
206 iota[2]*=r;
207
208 numtyp temp1 = gpu_dot3(iota,r12hat);
209 numtyp temp2 = (numtyp)-4.0*ir*ir*gum[2]*ucl_powr(chi,(gum[2]-(numtyp)1.0)/
210 gum[2]);
211 dchi[0] = temp2*(iota[0]-temp1*r12hat[0]);
212 dchi[1] = temp2*(iota[1]-temp1*r12hat[1]);
213 dchi[2] = temp2*(iota[2]-temp1*r12hat[2]);
214 }
215
216 numtyp temp2 = factor_lj*eta*chi;
217 if (eflag>0)
218 energy+=u_r*temp2;
219 numtyp temp1 = -eta*u_r*factor_lj;
220 if (vflag>0) {
221 r12[0]*=-1;
222 r12[1]*=-1;
223 r12[2]*=-1;
224 numtyp ft=temp1*dchi[0]-temp2*dUr[0];
225 f.x+=ft;
226 virial[0]+=r12[0]*ft;
227 ft=temp1*dchi[1]-temp2*dUr[1];
228 f.y+=ft;
229 virial[1]+=r12[1]*ft;
230 virial[3]+=r12[0]*ft;
231 ft=temp1*dchi[2]-temp2*dUr[2];
232 f.z+=ft;
233 virial[2]+=r12[2]*ft;
234 virial[4]+=r12[0]*ft;
235 virial[5]+=r12[1]*ft;
236 } else {
237 f.x+=temp1*dchi[0]-temp2*dUr[0];
238 f.y+=temp1*dchi[1]-temp2*dUr[1];
239 f.z+=temp1*dchi[2]-temp2*dUr[2];
240 }
241 } // for nbor
242 store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
243 ans,engv);
244 } // if ii
245 }
246
k_gayberne_lj(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1,const __global numtyp4 * restrict lj3,const int lj_types,const __global numtyp * restrict gum,const int stride,const __global int * dev_ij,__global acctyp4 * restrict ans,__global acctyp * restrict engv,__global int * restrict err_flag,const int eflag,const int vflag,const int start,const int inum,const int t_per_atom)247 __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_,
248 const __global numtyp4 *restrict lj1,
249 const __global numtyp4 *restrict lj3,
250 const int lj_types,
251 const __global numtyp *restrict gum,
252 const int stride,
253 const __global int *dev_ij,
254 __global acctyp4 *restrict ans,
255 __global acctyp *restrict engv,
256 __global int *restrict err_flag,
257 const int eflag, const int vflag, const int start,
258 const int inum, const int t_per_atom) {
259 int tid, ii, offset;
260 atom_info(t_per_atom,ii,tid,offset);
261 ii+=start;
262
263 __local numtyp sp_lj[4];
264 sp_lj[0]=gum[3];
265 sp_lj[1]=gum[4];
266 sp_lj[2]=gum[5];
267 sp_lj[3]=gum[6];
268
269 acctyp energy=(acctyp)0;
270 acctyp4 f;
271 f.x=(acctyp)0;
272 f.y=(acctyp)0;
273 f.z=(acctyp)0;
274 acctyp virial[6];
275 for (int i=0; i<6; i++)
276 virial[i]=(acctyp)0;
277
278 if (ii<inum) {
279 const __global int *nbor, *list_end;
280 int i, numj;
281 __local int n_stride;
282 nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
283 n_stride,list_end,nbor);
284
285 numtyp4 ix; fetch4(ix,i,pos_tex);
286 int itype=ix.w;
287
288 numtyp factor_lj;
289 for ( ; nbor<list_end; nbor+=n_stride) {
290
291 int j=*nbor;
292 factor_lj = sp_lj[sbmask(j)];
293 j &= NEIGHMASK;
294
295 numtyp4 jx; fetch4(jx,j,pos_tex);
296 int jtype=jx.w;
297
298 // Compute r12
299 numtyp delx = ix.x-jx.x;
300 numtyp dely = ix.y-jx.y;
301 numtyp delz = ix.z-jx.z;
302 numtyp r2inv = delx*delx+dely*dely+delz*delz;
303
304 int ii=itype*lj_types+jtype;
305 if (r2inv<lj1[ii].z && lj1[ii].w==SPHERE_SPHERE) {
306 r2inv=ucl_recip(r2inv);
307 numtyp r6inv = r2inv*r2inv*r2inv;
308 numtyp force = r2inv*r6inv*(lj1[ii].x*r6inv-lj1[ii].y);
309 force*=factor_lj;
310
311 f.x+=delx*force;
312 f.y+=dely*force;
313 f.z+=delz*force;
314
315 if (eflag>0) {
316 numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y);
317 energy+=factor_lj*(e-lj3[ii].z);
318 }
319 if (vflag>0) {
320 virial[0] += delx*delx*force;
321 virial[1] += dely*dely*force;
322 virial[2] += delz*delz*force;
323 virial[3] += delx*dely*force;
324 virial[4] += delx*delz*force;
325 virial[5] += dely*delz*force;
326 }
327 }
328
329 } // for nbor
330 acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
331 ans,engv);
332 } // if ii
333 }
334
k_gayberne_lj_fast(const __global numtyp4 * restrict x_,const __global numtyp4 * restrict lj1_in,const __global numtyp4 * restrict lj3_in,const __global numtyp * restrict gum,const int stride,const __global int * dev_ij,__global acctyp4 * restrict ans,__global acctyp * restrict engv,__global int * restrict err_flag,const int eflag,const int vflag,const int start,const int inum,const int t_per_atom)335 __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_,
336 const __global numtyp4 *restrict lj1_in,
337 const __global numtyp4 *restrict lj3_in,
338 const __global numtyp *restrict gum,
339 const int stride,
340 const __global int *dev_ij,
341 __global acctyp4 *restrict ans,
342 __global acctyp *restrict engv,
343 __global int *restrict err_flag,
344 const int eflag, const int vflag,
345 const int start, const int inum,
346 const int t_per_atom) {
347 int tid, ii, offset;
348 atom_info(t_per_atom,ii,tid,offset);
349 ii+=start;
350
351 __local numtyp sp_lj[4];
352 __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
353 __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
354 if (tid<4)
355 sp_lj[tid]=gum[tid+3];
356 if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
357 lj1[tid]=lj1_in[tid];
358 if (eflag>0)
359 lj3[tid]=lj3_in[tid];
360 }
361
362 acctyp energy=(acctyp)0;
363 acctyp4 f;
364 f.x=(acctyp)0;
365 f.y=(acctyp)0;
366 f.z=(acctyp)0;
367 acctyp virial[6];
368 for (int i=0; i<6; i++)
369 virial[i]=(acctyp)0;
370
371 __syncthreads();
372
373 if (ii<inum) {
374 const __global int *nbor, *list_end;
375 int i, numj;
376 __local int n_stride;
377 nbor_info_e(dev_ij,stride,t_per_atom,ii,offset,i,numj,
378 n_stride,list_end,nbor);
379
380 numtyp4 ix; fetch4(ix,i,pos_tex);
381 int iw=ix.w;
382 int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
383
384 numtyp factor_lj;
385 for ( ; nbor<list_end; nbor+=n_stride) {
386
387 int j=*nbor;
388 factor_lj = sp_lj[sbmask(j)];
389 j &= NEIGHMASK;
390
391 numtyp4 jx; fetch4(jx,j,pos_tex);
392 int mtype=itype+jx.w;
393
394 // Compute r12
395 numtyp delx = ix.x-jx.x;
396 numtyp dely = ix.y-jx.y;
397 numtyp delz = ix.z-jx.z;
398 numtyp r2inv = delx*delx+dely*dely+delz*delz;
399
400 if (r2inv<lj1[mtype].z && lj1[mtype].w==SPHERE_SPHERE) {
401 r2inv=ucl_recip(r2inv);
402 numtyp r6inv = r2inv*r2inv*r2inv;
403 numtyp force = factor_lj*r2inv*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
404
405 f.x+=delx*force;
406 f.y+=dely*force;
407 f.z+=delz*force;
408
409 if (eflag>0) {
410 numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
411 energy+=factor_lj*(e-lj3[mtype].z);
412 }
413 if (vflag>0) {
414 virial[0] += delx*delx*force;
415 virial[1] += dely*dely*force;
416 virial[2] += delz*delz*force;
417 virial[3] += delx*dely*force;
418 virial[4] += delx*delz*force;
419 virial[5] += dely*delz*force;
420 }
421 }
422
423 } // for nbor
424 acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
425 ans,engv);
426 } // if ii
427 }
428
429