1 /* ----------------------------------------------------------------------
2 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3
4 Original Version:
5 http://lammps.sandia.gov, Sandia National Laboratories
6 Steve Plimpton, sjplimp@sandia.gov
7
8 See the README file in the top-level LAMMPS directory.
9
10 -----------------------------------------------------------------------
11
12 USER-CUDA Package and associated modifications:
13 https://sourceforge.net/projects/lammpscuda/
14
15 Christian Trott, christian.trott@tu-ilmenau.de
16 Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17 Theoretical Physics II, University of Technology Ilmenau, Germany
18
19 See the README file in the USER-CUDA directory.
20
21 This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23
24
PairGranHookeCuda_Kernel(int eflag,int vflag,int eflag_atom,int vflag_atom,int ** firstneight,int * binned_id,F_FLOAT kn,F_FLOAT gamman,F_FLOAT gammat,F_FLOAT xmu)25 __global__ void PairGranHookeCuda_Kernel(int eflag, int vflag, int eflag_atom, int vflag_atom, int** firstneight, int* binned_id
26 , F_FLOAT kn, F_FLOAT gamman, F_FLOAT gammat, F_FLOAT xmu)
27 {
28 ENERGY_FLOAT evdwl = ENERGY_F(0.0);
29
30 ENERGY_FLOAT* sharedE;
31 ENERGY_FLOAT* sharedV;
32
33 if(eflag || eflag_atom) {
34 sharedE = &sharedmem[threadIdx.x];
35 sharedV = &sharedmem[0];
36 sharedE[0] = ENERGY_F(0.0);
37 sharedV += blockDim.x;
38 }
39
40 if(vflag || vflag_atom) {
41 sharedV += threadIdx.x;
42 sharedV[0 * blockDim.x] = ENERGY_F(0.0);
43 sharedV[1 * blockDim.x] = ENERGY_F(0.0);
44 sharedV[2 * blockDim.x] = ENERGY_F(0.0);
45 sharedV[3 * blockDim.x] = ENERGY_F(0.0);
46 sharedV[4 * blockDim.x] = ENERGY_F(0.0);
47 sharedV[5 * blockDim.x] = ENERGY_F(0.0);
48 }
49
50 int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
51
52 MYEMUDBG(if(ii == 0) printf("# CUDA: PairGranHookeCuda_Kernel: -- no binning --\n");)
53
54 X_FLOAT xtmp, ytmp, ztmp;
55
56 X_FLOAT4 myxtype;
57 V_FLOAT4 myvradius, ovradius;
58 F_FLOAT fxtmp, fytmp, fztmp, torquextmp, torqueytmp, torqueztmp;
59 F_FLOAT delx, dely, delz;
60 F_FLOAT radi, radj, radsum, r, rsqinv;
61 F_FLOAT vr1, vr2, vr3, vnnr, vn1, vn2, vn3, vt1, vt2, vt3;
62 F_FLOAT wr1, wr2, wr3;
63 F_FLOAT vtr1, vtr2, vtr3, vrel;
64 F_FLOAT meff, damp, ccel, tor1, tor2, tor3;
65 F_FLOAT fn, fs, ft, fs1, fs2, fs3;
66
67 int jnum = 0;
68 int i, j;
69 int* jlist;
70
71 if(ii < _inum) {
72 i = _ilist[ii];
73
74 myxtype = fetchXType(i);
75 myvradius = fetchVRadius(i);
76
77 xtmp = myxtype.x;
78 ytmp = myxtype.y;
79 ztmp = myxtype.z;
80 radi = myvradius.w;
81
82 fxtmp = F_F(0.0);
83 fytmp = F_F(0.0);
84 fztmp = F_F(0.0);
85 torquextmp = F_F(0.0);
86 torqueytmp = F_F(0.0);
87 torqueztmp = F_F(0.0);
88
89 jnum = _numneigh[i];
90
91 jlist = &_neighbors[i];
92 }
93
94 __syncthreads();
95
96 for(int jj = 0; jj < jnum; jj++) {
97 if(ii < _inum)
98 if(jj < jnum) {
99 j = jlist[jj * _nlocal];
100
101 myxtype = fetchXType(j);
102 ovradius = fetchVRadius(j);
103
104 delx = xtmp - myxtype.x;
105 dely = ytmp - myxtype.y;
106 delz = ztmp - myxtype.z;
107
108 radj = ovradius.w;
109 radsum = radi + radj;
110
111 const F_FLOAT rsq = delx * delx + dely * dely + delz * delz;
112
113 if(rsq < radsum * radsum) {
114 const F_FLOAT rinv = _RSQRT_(rsq);
115 r = F_F(1.0) / rinv;
116 rsqinv = F_F(1.0) / rsq;
117
118 // relative translational velocity
119
120 vr1 = myvradius.x - ovradius.x;
121 vr2 = myvradius.y - ovradius.y;
122 vr3 = myvradius.z - ovradius.z;
123
124 // normal component
125
126 vnnr = vr1 * delx + vr2 * dely + vr3 * delz;
127 vn1 = delx * vnnr * rsqinv;
128 vn2 = dely * vnnr * rsqinv;
129 vn3 = delz * vnnr * rsqinv;
130
131 // tangential component
132
133 vt1 = vr1 - vn1;
134 vt2 = vr2 - vn2;
135 vt3 = vr3 - vn3;
136
137 // relative rotational velocity
138 V_FLOAT4 omegarmass_i = fetchOmegaRmass(i);
139 V_FLOAT4 omegarmass_j = fetchOmegaRmass(j);
140
141 wr1 = (radi * omegarmass_i.x + radj * omegarmass_j.x) * rinv;
142 wr2 = (radi * omegarmass_i.y + radj * omegarmass_j.y) * rinv;
143 wr3 = (radi * omegarmass_i.z + radj * omegarmass_j.z) * rinv;
144
145 meff = omegarmass_i.w * omegarmass_j.w / (omegarmass_i.w + omegarmass_j.w);
146
147 if(_mask[i] & _freeze_group_bit) meff = omegarmass_j.w;
148
149 if(_mask[j] & _freeze_group_bit) meff = omegarmass_i.w;
150
151 damp = meff * gamman * vnnr * rsqinv;
152 ccel = kn * (radsum - r) * rinv - damp;
153
154 vtr1 = vt1 - (delz * wr2 - dely * wr3);
155 vtr2 = vt2 - (delx * wr3 - delz * wr1);
156 vtr3 = vt3 - (dely * wr1 - delx * wr2);
157 vrel = vtr1 * vtr1 + vtr2 * vtr2 + vtr3 * vtr3;
158 vrel = _SQRT_(vrel);
159
160 fn = xmu * fabs(ccel * r);
161 fs = meff * gammat * vrel;
162 ft = (vrel != F_F(0.0)) ? MIN(fn, fs) / vrel : F_F(0.0);
163
164 fs1 = -ft * vtr1;
165 fs2 = -ft * vtr2;
166 fs3 = -ft * vtr3;
167
168 F_FLOAT dxfp, dyfp, dzfp;
169 fxtmp += dxfp = delx * ccel + fs1;
170 fytmp += dyfp = dely * ccel + fs2;
171 fztmp += dzfp = delz * ccel + fs3;
172
173 tor1 = rinv * (dely * fs3 - delz * fs2);
174 tor2 = rinv * (delz * fs1 - delx * fs3);
175 tor3 = rinv * (delx * fs2 - dely * fs1);
176
177 torquextmp -= radi * tor1;
178 torqueytmp -= radi * tor2;
179 torqueztmp -= radi * tor3;
180
181 if(vflag) {
182 sharedV[0 * blockDim.x] += delx * dxfp;
183 sharedV[1 * blockDim.x] += dely * dyfp;
184 sharedV[2 * blockDim.x] += delz * dzfp;
185 sharedV[3 * blockDim.x] += delx * dyfp;
186 sharedV[4 * blockDim.x] += delx * dzfp;
187 sharedV[5 * blockDim.x] += dely * dzfp;
188 }
189
190 }
191 }
192 }
193
194 __syncthreads();
195
196 if(ii < _inum) {
197 F_FLOAT* my_f = _f + i;
198 *my_f += fxtmp;
199 my_f += _nmax;
200 *my_f += fytmp;
201 my_f += _nmax;
202 *my_f += fztmp;
203 F_FLOAT* my_torque = _torque + i;
204 *my_torque += torquextmp;
205 my_torque += _nmax;
206 *my_torque += torqueytmp;
207 my_torque += _nmax;
208 *my_torque += torqueztmp;
209 }
210
211 __syncthreads();
212
213 if(eflag) sharedE[0] = evdwl;
214
215 if(eflag_atom && i < _nlocal) _eatom[i] += evdwl;
216
217 if(vflag_atom && i < _nlocal) {
218 _vatom[i] += ENERGY_F(0.5) * sharedV[0 * blockDim.x];
219 _vatom[i + _nmax] += ENERGY_F(0.5) * sharedV[1 * blockDim.x];
220 _vatom[i + 2 * _nmax] += ENERGY_F(0.5) * sharedV[2 * blockDim.x];
221 _vatom[i + 3 * _nmax] += ENERGY_F(0.5) * sharedV[3 * blockDim.x];
222 _vatom[i + 4 * _nmax] += ENERGY_F(0.5) * sharedV[4 * blockDim.x];
223 _vatom[i + 5 * _nmax] += ENERGY_F(0.5) * sharedV[5 * blockDim.x];
224 }
225
226 if(vflag || eflag) PairVirialCompute_A_Kernel(eflag, vflag, 0);
227 }
228