1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23 
24 
PairGranHookeCuda_Kernel(int eflag,int vflag,int eflag_atom,int vflag_atom,int ** firstneight,int * binned_id,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat,F_FLOAT xmu)25 __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id
26     , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu)
27 {
28   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
29 
30   ENERGY_FLOAT* sharedE;
31   ENERGY_FLOAT* sharedV;
32 
33   if(eflag || eflag_atom) {
34     sharedE = &sharedmem[threadIdx.x];
35     sharedV = &sharedmem[0];
36     sharedE[0] = ENERGY_F(0.0);
37     sharedV += blockDim.x;
38   }
39 
40   if(vflag || vflag_atom) {
41     sharedV += threadIdx.x;
42     sharedV[0 * blockDim.x] = ENERGY_F(0.0);
43     sharedV[1 * blockDim.x] = ENERGY_F(0.0);
44     sharedV[2 * blockDim.x] = ENERGY_F(0.0);
45     sharedV[3 * blockDim.x] = ENERGY_F(0.0);
46     sharedV[4 * blockDim.x] = ENERGY_F(0.0);
47     sharedV[5 * blockDim.x] = ENERGY_F(0.0);
48   }
49 
50   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
51 
52   MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");)
53 
54     X_FLOAT xtmp, ytmp, ztmp;
55 
56   X_FLOAT4 myxtype;
57   V_FLOAT4 myvradius, ovradius;
58   F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
59   F_FLOAT delx, dely, delz;
60   F_FLOAT radi, radj, radsum, r, rsqinv;
61   F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
62   F_FLOAT wr1, wr2, wr3;
63   F_FLOAT vtr1, vtr2, vtr3, vrel;
64   F_FLOAT meff, damp, ccel, tor1, tor2, tor3;
65   F_FLOAT fn, fs, ft, fs1, fs2, fs3;
66 
67   int jnum = 0;
68   int i, j;
69   int* jlist;
70 
71   if(ii < _inum) {
72     i = _ilist[ii];
73 
74     myxtype = fetchXType(i);
75     myvradius = fetchVRadius(i);
76 
77     xtmp = myxtype.x;
78     ytmp = myxtype.y;
79     ztmp = myxtype.z;
80     radi = myvradius.w;
81 
82     fxtmp = F_F(0.0);
83     fytmp = F_F(0.0);
84     fztmp = F_F(0.0);
85     torquextmp = F_F(0.0);
86     torqueytmp = F_F(0.0);
87     torqueztmp = F_F(0.0);
88 
89     jnum = _numneigh[i];
90 
91     jlist = &_neighbors[i];
92   }
93 
94   __syncthreads();
95 
96   for(int jj = 0; jj < jnum; jj++) {
97     if(ii < _inum)
98       if(jj < jnum) {
99         j = jlist[jj * _nlocal];
100 
101         myxtype = fetchXType(j);
102         ovradius = fetchVRadius(j);
103 
104         delx = xtmp - myxtype.x;
105         dely = ytmp - myxtype.y;
106         delz = ztmp - myxtype.z;
107 
108         radj = ovradius.w;
109         radsum = radi + radj;
110 
111         const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
112 
113         if(rsq < radsum * radsum) {
114           const F_FLOAT rinv = _RSQRT_(rsq);
115           r = F_F(1.0) / rinv;
116           rsqinv = F_F(1.0) / rsq;
117 
118           // relative translational velocity
119 
120           vr1 = myvradius.x - ovradius.x;
121           vr2 = myvradius.y - ovradius.y;
122           vr3 = myvradius.z - ovradius.z;
123 
124           // normal component
125 
126           vnnr = vr1 * delx + vr2 * dely + vr3 * delz;
127           vn1 = delx * vnnr * rsqinv;
128           vn2 = dely * vnnr * rsqinv;
129           vn3 = delz * vnnr * rsqinv;
130 
131           // tangential component
132 
133           vt1 = vr1 - vn1;
134           vt2 = vr2 - vn2;
135           vt3 = vr3 - vn3;
136 
137           // relative rotational velocity
138           V_FLOAT4 omegarmass_i = fetchOmegaRmass(i);
139           V_FLOAT4 omegarmass_j = fetchOmegaRmass(j);
140 
141           wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv;
142           wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv;
143           wr3 = (radi * omegarmass_i.z + radj * omegarmass_j.z) * rinv;
144 
145           meff = omegarmass_i.w * omegarmass_j.w / (omegarmass_i.w + omegarmass_j.w);
146 
147           if(_mask[i] & _freeze_group_bit) meff = omegarmass_j.w;
148 
149           if(_mask[j] & _freeze_group_bit) meff = omegarmass_i.w;
150 
151           damp = meff * gamman * vnnr * rsqinv;
152           ccel = kn * (radsum - r) * rinv - damp;
153 
154           vtr1 = vt1 - (delz * wr2 - dely * wr3);
155           vtr2 = vt2 - (delx * wr3 - delz * wr1);
156           vtr3 = vt3 - (dely * wr1 - delx * wr2);
157           vrel = vtr1 * vtr1 + vtr2 * vtr2 + vtr3 * vtr3;
158           vrel = _SQRT_(vrel);
159 
160           fn = xmu * fabs(ccel * r);
161           fs = meff * gammat * vrel;
162           ft = (vrel != F_F(0.0)) ? MIN(fn, fs) / vrel : F_F(0.0);
163 
164           fs1 = -ft * vtr1;
165           fs2 = -ft * vtr2;
166           fs3 = -ft * vtr3;
167 
168           F_FLOAT dxfp, dyfp, dzfp;
169           fxtmp += dxfp = delx * ccel + fs1;
170           fytmp += dyfp = dely * ccel + fs2;
171           fztmp += dzfp = delz * ccel + fs3;
172 
173           tor1 = rinv * (dely * fs3 - delz * fs2);
174           tor2 = rinv * (delz * fs1 - delx * fs3);
175           tor3 = rinv * (delx * fs2 - dely * fs1);
176 
177           torquextmp -= radi * tor1;
178           torqueytmp -= radi * tor2;
179           torqueztmp -= radi * tor3;
180 
181           if(vflag) {
182             sharedV[0 * blockDim.x] += delx * dxfp;
183             sharedV[1 * blockDim.x] += dely * dyfp;
184             sharedV[2 * blockDim.x] += delz * dzfp;
185             sharedV[3 * blockDim.x] += delx * dyfp;
186             sharedV[4 * blockDim.x] += delx * dzfp;
187             sharedV[5 * blockDim.x] += dely * dzfp;
188           }
189 
190         }
191       }
192   }
193 
194   __syncthreads();
195 
196   if(ii < _inum) {
197     F_FLOAT* my_f = _f + i;
198     *my_f += fxtmp;
199     my_f += _nmax;
200     *my_f += fytmp;
201     my_f += _nmax;
202     *my_f += fztmp;
203     F_FLOAT* my_torque = _torque + i;
204     *my_torque += torquextmp;
205     my_torque += _nmax;
206     *my_torque += torqueytmp;
207     my_torque += _nmax;
208     *my_torque += torqueztmp;
209   }
210 
211   __syncthreads();
212 
213   if(eflag) sharedE[0] = evdwl;
214 
215   if(eflag_atom && i < _nlocal) _eatom[i] += evdwl;
216 
217   if(vflag_atom && i < _nlocal) {
218     _vatom[i]         += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
219     _vatom[i + _nmax]   += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
220     _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
221     _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
222     _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
223     _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
224   }
225 
226   if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
227 }
228