1 /* ----------------------------------------------------------------------
2    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
3 
4    Original Version:
5    http://lammps.sandia.gov, Sandia National Laboratories
6    Steve Plimpton, sjplimp@sandia.gov
7 
8    See the README file in the top-level LAMMPS directory.
9 
10    -----------------------------------------------------------------------
11 
12    USER-CUDA Package and associated modifications:
13    https://sourceforge.net/projects/lammpscuda/
14 
15    Christian Trott, christian.trott@tu-ilmenau.de
16    Lars Winterfeld, lars.winterfeld@tu-ilmenau.de
17    Theoretical Physics II, University of Technology Ilmenau, Germany
18 
19    See the README file in the USER-CUDA directory.
20 
21    This software is distributed under the GNU General Public License.
22 ------------------------------------------------------------------------- */
23 #define Pi F_F(3.1415926535897932384626433832795)
24 #define PI Pi
25 #define PI2 F_F(0.5)*Pi
26 #define PI4 F_F(0.25)*Pi
27 template <const int eflag, const int vflag>
PairVirialCompute_A_Kernel_Template()28 static inline __device__ void PairVirialCompute_A_Kernel_Template()
29 {
30   __syncthreads();
31   ENERGY_FLOAT* shared = sharedmem;
32 
33   if(eflag) {
34     reduceBlock(shared);
35     shared += blockDim.x;
36   }
37 
38   if(vflag) {
39     reduceBlock(shared + 0 * blockDim.x);
40     reduceBlock(shared + 1 * blockDim.x);
41     reduceBlock(shared + 2 * blockDim.x);
42     reduceBlock(shared + 3 * blockDim.x);
43     reduceBlock(shared + 4 * blockDim.x);
44     reduceBlock(shared + 5 * blockDim.x);
45   }
46 
47   if(threadIdx.x == 0) {
48     shared = sharedmem;
49     ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
50 
51     if(eflag) {
52       buffer[blockIdx.x * gridDim.y + blockIdx.y] = ENERGY_F(0.5) * shared[0];
53       shared += blockDim.x;
54       buffer += gridDim.x * gridDim.y;
55     }
56 
57     if(vflag) {
58       buffer[blockIdx.x * gridDim.y + blockIdx.y + 0 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[0 * blockDim.x];
59       buffer[blockIdx.x * gridDim.y + blockIdx.y + 1 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[1 * blockDim.x];
60       buffer[blockIdx.x * gridDim.y + blockIdx.y + 2 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[2 * blockDim.x];
61       buffer[blockIdx.x * gridDim.y + blockIdx.y + 3 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[3 * blockDim.x];
62       buffer[blockIdx.x * gridDim.y + blockIdx.y + 4 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[4 * blockDim.x];
63       buffer[blockIdx.x * gridDim.y + blockIdx.y + 5 * gridDim.x * gridDim.y] = ENERGY_F(0.5) * shared[5 * blockDim.x];
64     }
65   }
66 
67   __syncthreads();
68 }
69 
virial_fdotr_compute_kernel(int eflag)70 __global__ void virial_fdotr_compute_kernel(int eflag)
71 {
72   int i = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
73   ENERGY_FLOAT* sharedE = (ENERGY_FLOAT*) &sharedmem[0];
74   ENERGY_FLOAT* sharedVirial = (ENERGY_FLOAT*) &sharedE[blockDim.x];
75   sharedE += threadIdx.x;
76   sharedVirial += threadIdx.x;
77 
78   if(i < _nlocal) {
79 
80     F_FLOAT x = _x[i];
81     F_FLOAT y = _x[i + _nmax];
82     F_FLOAT z = _x[i + 2 * _nmax];
83     F_FLOAT fx = _f[i];
84     F_FLOAT fy = _f[i + _nmax];
85     F_FLOAT fz = _f[i + 2 * _nmax];
86     //if(fz*z*fz*z>1e-5) printf("V %i %i %e %e %e %e %e %e\n",i,_tag[i],x,y,z,fx,fy,fz);
87     sharedVirial[0] = fx * x;
88     sharedVirial[1 * blockDim.x] = fy * y;
89     sharedVirial[2 * blockDim.x] = fz * z;
90     sharedVirial[3 * blockDim.x] = fy * x;
91     sharedVirial[4 * blockDim.x] = fz * x;
92     sharedVirial[5 * blockDim.x] = fz * y;
93   } else {
94     sharedVirial[0] = 0;
95     sharedVirial[1 * blockDim.x] = 0;
96     sharedVirial[2 * blockDim.x] = 0;
97     sharedVirial[3 * blockDim.x] = 0;
98     sharedVirial[4 * blockDim.x] = 0;
99     sharedVirial[5 * blockDim.x] = 0;
100   }
101 
102   sharedVirial = (ENERGY_FLOAT*) &sharedmem[0];
103   sharedVirial += blockDim.x;
104   reduceBlockP2(sharedVirial);
105   reduceBlockP2(&sharedVirial[1 * blockDim.x]);
106   reduceBlockP2(&sharedVirial[2 * blockDim.x]);
107   reduceBlockP2(&sharedVirial[3 * blockDim.x]);
108   reduceBlockP2(&sharedVirial[4 * blockDim.x]);
109   reduceBlockP2(&sharedVirial[5 * blockDim.x]);
110 
111   if(threadIdx.x < 6) {
112     ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
113 
114     if(eflag) buffer = &buffer[gridDim.x * gridDim.y];
115 
116     buffer[blockIdx.x * gridDim.y + blockIdx.y + threadIdx.x * gridDim.x * gridDim.y] = sharedVirial[threadIdx.x * blockDim.x];
117   }
118 }
119 
120 /*#define vec3_scale(K,X,Y) Y.x = K*X.x;  Y.y = K*X.y;  Y.z = K*X.z;
121 #define vec3_scaleadd(K,X,Y,Z) Z.x = K*X.x+Y.x;  Z.y = K*X.y+Y.y;  Z.z = K*X.z+Y.z;
122 #define vec3_add(X,Y,Z) Z.x = X.x+Y.x;  Z.y = X.y+Y.y;  Z.z = X.z+Y.z;
123 #define vec3_dot(X,Y) (X.x*Y.x + X.y*Y.y + X.z*Y.z)*/
124 
vec3_scale(F_FLOAT k,F_FLOAT3 & x,F_FLOAT3 & y)125 __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y)
126 {
127   y.x = k * x.x;
128   y.y = k * x.y;
129   y.z = k * x.z;
130 }
131 
vec3_scale(F_FLOAT k,F_FLOAT4 & x,F_FLOAT3 & y)132 __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT3 &y)
133 {
134   y.x = k * x.x;
135   y.y = k * x.y;
136   y.z = k * x.z;
137 }
138 
vec3_scale(F_FLOAT k,F_FLOAT4 & x,F_FLOAT4 & y)139 __device__ inline void vec3_scale(F_FLOAT k, F_FLOAT4 &x, F_FLOAT4 &y)
140 {
141   y.x = k * x.x;
142   y.y = k * x.y;
143   y.z = k * x.z;
144 }
145 
vec3_scaleadd(F_FLOAT k,F_FLOAT3 & x,F_FLOAT3 & y,F_FLOAT3 & z)146 __device__ inline void vec3_scaleadd(F_FLOAT k, F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z)
147 {
148   z.x = k * x.x + y.x;
149   z.y = k * x.y + y.y;
150   z.z = k * x.z + y.z;
151 }
152 
vec3_add(F_FLOAT3 & x,F_FLOAT3 & y,F_FLOAT3 & z)153 __device__ inline void vec3_add(F_FLOAT3 &x, F_FLOAT3 &y, F_FLOAT3 &z)
154 {
155   z.x = x.x + y.x;
156   z.y = x.y + y.y;
157   z.z = x.z + y.z;
158 }
159 
vec3_dot(F_FLOAT3 x,F_FLOAT3 y)160 __device__ inline F_FLOAT vec3_dot(F_FLOAT3 x, F_FLOAT3 y)
161 {
162   return x.x * y.x + x.y * y.y + x.z * y.z;
163 }
164 
vec3_dot(F_FLOAT4 x,F_FLOAT4 y)165 __device__ inline F_FLOAT vec3_dot(F_FLOAT4 x, F_FLOAT4 y)
166 {
167   return x.x * y.x + x.y * y.y + x.z * y.z;
168 }
169 
170 /* ----------------------------------------------------------------------
171    Fermi-like smoothing function
172 ------------------------------------------------------------------------- */
173 
F_fermi(F_FLOAT & r,int & iparam)174 __device__ inline F_FLOAT F_fermi(F_FLOAT &r, int &iparam)
175 {
176   return F_F(1.0) / (F_F(1.0) + exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut)));
177 }
178 
179 /* ----------------------------------------------------------------------
180    Fermi-like smoothing function derivative with respect to r
181 ------------------------------------------------------------------------- */
182 
F_fermi_d(F_FLOAT & r,int & iparam)183 __device__ inline F_FLOAT F_fermi_d(F_FLOAT &r, int &iparam)
184 {
185   volatile const F_FLOAT tmp =  exp(-params[iparam].ZBLexpscale * (r - params[iparam].ZBLcut));
186   return params[iparam].ZBLexpscale * tmp /
187          ((F_F(1.0) + tmp) * (F_F(1.0) + tmp));
188 }
189 
ters_fc(F_FLOAT r,F_FLOAT ters_R,F_FLOAT ters_D)190 __device__ inline F_FLOAT ters_fc(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D)
191 {
192   return (r < ters_R - ters_D) ? F_F(1.0) : ((r > ters_R + ters_D) ?
193          F_F(0.0) : F_F(0.5) * (F_F(1.0) - sin(PI2 * (r - ters_R) / ters_D)));
194 }
195 
ters_fc_d(F_FLOAT r,F_FLOAT ters_R,F_FLOAT ters_D)196 __device__ inline F_FLOAT ters_fc_d(F_FLOAT r, F_FLOAT ters_R, F_FLOAT ters_D)
197 {
198   return ((r < ters_R - ters_D) || (r > ters_R + ters_D)) ?
199          F_F(0.0) : -(PI4 / ters_D) * cos(PI2 * (r - ters_R) / ters_D);
200 }
201 
202 
ters_gijk(F_FLOAT & cos_theta,int iparam)203 __device__ inline F_FLOAT ters_gijk(F_FLOAT &cos_theta, int iparam)
204 {
205   F_FLOAT ters_c = params[iparam].c;
206   F_FLOAT ters_d = params[iparam].d;
207 
208   return params[iparam].gamma * (F_F(1.0) + pow(params[iparam].c / params[iparam].d, F_F(2.0)) -
209                                  pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0))));
210 }
211 
ters_gijk2(F_FLOAT & cos_theta,int iparam)212 __device__ F_FLOAT ters_gijk2(F_FLOAT &cos_theta, int iparam)
213 {
214   F_FLOAT ters_c = params[iparam].c;
215   F_FLOAT ters_d = params[iparam].d;
216 
217   return params[iparam].gamma * (F_F(1.0) + pow(ters_c / ters_d, F_F(2.0)) -
218                                  pow(ters_c, F_F(2.0)) / (pow(ters_d, F_F(2.0)) + pow(params[iparam].h - cos_theta, F_F(2.0))));
219 }
220 
ters_gijk_d(F_FLOAT costheta,int iparam)221 __device__ inline F_FLOAT ters_gijk_d(F_FLOAT costheta, int iparam)
222 {
223   F_FLOAT numerator = -F_F(2.0) * pow(params[iparam].c, F_F(2.0)) * (params[iparam].h - costheta);
224   F_FLOAT denominator = pow(pow(params[iparam].d, F_F(2.0)) +
225                             pow(params[iparam].h - costheta, F_F(2.0)), F_F(2.0));
226   return params[iparam].gamma * numerator / denominator;
227 }
228 
zeta(int iparam,const F_FLOAT rsqij,const F_FLOAT rsqik,F_FLOAT3 & delij,F_FLOAT3 & delik)229 __device__ inline F_FLOAT zeta(int iparam, const F_FLOAT rsqij, const F_FLOAT rsqik,
230                                F_FLOAT3 &delij, F_FLOAT3 &delik)
231 {
232   F_FLOAT rij, rik, costheta, arg, ex_delr;
233 
234   rij = sqrt(rsqij);
235   rik = sqrt(rsqik);
236   costheta = vec3_dot(delij, delik) / (rij * rik);
237 
238   arg = (params[iparam].powermint == 3) ? (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik)) : params[iparam].lam3 * (rij - rik);
239 
240   if(arg > F_F(69.0776)) ex_delr = F_F(1.e30);
241   else if(arg < -F_F(69.0776)) ex_delr = F_F(0.0);
242   else ex_delr = exp(arg);
243 
244   return ters_fc(rik, params[iparam].bigr, params[iparam].bigd) * ex_delr * params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c / (params[iparam].d * params[iparam].d)) -
245          (params[iparam].c * params[iparam].c) / ((params[iparam].d * params[iparam].d) + (params[iparam].h - costheta) * (params[iparam].h - costheta)));
246 }
247 
repulsive(int iparam,F_FLOAT rsq,F_FLOAT & fforce,int eflag,ENERGY_FLOAT & eng)248 __device__ void repulsive(int iparam, F_FLOAT rsq, F_FLOAT &fforce,
249                           int eflag, ENERGY_FLOAT &eng)
250 {
251   F_FLOAT r, tmp_fc, tmp_fc_d, tmp_exp;
252 
253   F_FLOAT ters_R = params[iparam].bigr;
254   F_FLOAT ters_D = params[iparam].bigd;
255   r = sqrt(rsq);
256   tmp_fc = ters_fc(r, ters_R, ters_D);
257   tmp_fc_d = ters_fc_d(r, ters_R, ters_D);
258   tmp_exp = exp(-params[iparam].lam1 * r);
259 
260   if(!_zbl) {
261     fforce = -params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1) / r;
262 
263     if(eflag) eng += tmp_fc * params[iparam].biga * tmp_exp;
264   } else {
265     F_FLOAT const fforce_ters = params[iparam].biga * tmp_exp * (tmp_fc_d - tmp_fc * params[iparam].lam1);
266     ENERGY_FLOAT eng_ters = tmp_fc * params[iparam].biga * tmp_exp;
267 
268     F_FLOAT r_ov_a = r / params[iparam].a_ij;
269     F_FLOAT phi = F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) + F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) +
270                   F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) + F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a);
271     F_FLOAT dphi = (F_F(1.0) / params[iparam].a_ij) * (-F_F(3.2) * F_F(0.1818) * exp(-F_F(3.2) * r_ov_a) -
272                    F_F(0.9423) * F_F(0.5099) * exp(-F_F(0.9423) * r_ov_a) -
273                    F_F(0.4029) * F_F(0.2802) * exp(-F_F(0.4029) * r_ov_a) -
274                    F_F(0.2016) * F_F(0.02817) * exp(-F_F(0.2016) * r_ov_a));
275     F_FLOAT fforce_ZBL = params[iparam].premult / (-r * r) * phi + params[iparam].premult / r * dphi;
276     ENERGY_FLOAT eng_ZBL = params[iparam].premult * (F_F(1.0) / r) * phi;
277 
278     fforce = -(-F_fermi_d(r, iparam) * (eng_ZBL - eng_ters) + fforce_ZBL + F_fermi(r, iparam) * (fforce_ters - fforce_ZBL)) / r;
279 
280     if(eflag)
281       eng += eng_ZBL + F_fermi(r, iparam) * (eng_ters - eng_ZBL);
282   }
283 
284 
285 }
286 
287 /* ---------------------------------------------------------------------- */
288 
ters_fa(F_FLOAT r,int iparam,F_FLOAT ters_R,F_FLOAT ters_D)289 __device__ inline F_FLOAT ters_fa(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D)
290 {
291   if(r > ters_R + ters_D) return F_F(0.0);
292 
293   if(_zbl)
294     return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D) * F_fermi(r, iparam);
295   else
296     return -params[iparam].bigb * exp(-params[iparam].lam2 * r) * ters_fc(r, ters_R, ters_D);
297 }
298 
299 /* ---------------------------------------------------------------------- */
300 
ters_fa_d(F_FLOAT r,int iparam,F_FLOAT ters_R,F_FLOAT ters_D)301 __device__ inline F_FLOAT ters_fa_d(F_FLOAT r, int iparam, F_FLOAT ters_R, F_FLOAT ters_D)
302 {
303   if(r > ters_R + ters_D) return F_F(0.0);
304 
305   if(_zbl)
306     return params[iparam].bigb * exp(-params[iparam].lam2 * r) *
307            ((params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D)) * F_fermi(r, iparam)
308             - ters_fc(r, ters_R, ters_D) * F_fermi_d(r, iparam));
309   else
310     return params[iparam].bigb * exp(-params[iparam].lam2 * r) *
311            (params[iparam].lam2 * ters_fc(r, ters_R, ters_D) - ters_fc_d(r, ters_R, ters_D));
312 }
313 
314 /* ---------------------------------------------------------------------- */
315 
ters_bij(F_FLOAT zeta,int iparam)316 __device__ inline F_FLOAT ters_bij(F_FLOAT zeta, int iparam)
317 {
318   F_FLOAT tmp = params[iparam].beta * zeta;
319 
320   if(tmp > params[iparam].c1) return F_F(1.0) / sqrt(tmp);
321 
322   if(tmp > params[iparam].c2)
323     return (F_F(1.0) - pow(tmp, -params[iparam].powern) / (F_F(2.0) * params[iparam].powern)) / sqrt(tmp);
324 
325   if(tmp < params[iparam].c4) return F_F(1.0);
326 
327   if(tmp < params[iparam].c3)
328     return F_F(1.0) - pow(tmp, params[iparam].powern) / (F_F(2.0) * params[iparam].powern);
329 
330   return pow(F_F(1.0) + pow(tmp, params[iparam].powern), -F_F(1.0) / (F_F(2.0) * params[iparam].powern));
331 }
332 
333 /* ---------------------------------------------------------------------- */
334 
ters_bij_d(F_FLOAT zeta,int iparam)335 __device__ inline F_FLOAT ters_bij_d(F_FLOAT zeta, int iparam)
336 {
337   F_FLOAT tmp = params[iparam].beta * zeta;
338 
339   if(tmp > params[iparam].c1) return params[iparam].beta * -F_F(0.5) * pow(tmp, -F_F(1.5));
340 
341   if(tmp > params[iparam].c2)
342     return params[iparam].beta * (-F_F(0.5) * pow(tmp, -F_F(1.5)) *
343                                   (F_F(1.0) - F_F(0.5) * (F_F(1.0) +  F_F(1.0) / (F_F(2.0) * params[iparam].powern)) *
344                                    pow(tmp, -params[iparam].powern)));
345 
346   if(tmp < params[iparam].c4) return F_F(0.0);
347 
348   if(tmp < params[iparam].c3)
349     return -F_F(0.5) * params[iparam].beta * pow(tmp, params[iparam].powern - F_F(1.0));
350 
351   F_FLOAT tmp_n = pow(tmp, params[iparam].powern);
352   return -F_F(0.5) * pow(F_F(1.0) + tmp_n, -F_F(1.0) - (F_F(1.0) / (F_F(2.0) * params[iparam].powern))) * tmp_n / zeta;
353 }
354 
force_zeta(int iparam,F_FLOAT rsq,F_FLOAT zeta_ij,F_FLOAT & fforce,F_FLOAT & prefactor,int eflag,F_FLOAT & eng)355 __device__ void force_zeta(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
356                            F_FLOAT &fforce, F_FLOAT &prefactor,
357                            int eflag, F_FLOAT &eng)
358 {
359   F_FLOAT r, fa, fa_d, bij;
360   F_FLOAT ters_R = params[iparam].bigr;
361   F_FLOAT ters_D = params[iparam].bigd;
362   r = sqrt(rsq);
363   fa = ters_fa(r, iparam, ters_R, ters_D);
364   fa_d = ters_fa_d(r, iparam, ters_R, ters_D);
365   bij = ters_bij(zeta_ij, iparam);
366   fforce = F_F(0.5) * bij * fa_d / r;
367   prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
368 
369   if(eflag) eng += bij * fa;
370 }
371 
force_zeta_prefactor_force(int iparam,F_FLOAT rsq,F_FLOAT zeta_ij,F_FLOAT & fforce,F_FLOAT & prefactor)372 __device__ void force_zeta_prefactor_force(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
373     F_FLOAT &fforce, F_FLOAT &prefactor)
374 {
375   F_FLOAT r, fa, fa_d, bij;
376   F_FLOAT ters_R = params[iparam].bigr;
377   F_FLOAT ters_D = params[iparam].bigd;
378   r = sqrt(rsq);
379   fa = ters_fa(r, iparam, ters_R, ters_D);
380   fa_d = ters_fa_d(r, iparam, ters_R, ters_D);
381   bij = ters_bij(zeta_ij, iparam);
382   fforce = F_F(0.5) * bij * fa_d / r;
383   prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
384 }
385 
force_zeta_prefactor(int iparam,F_FLOAT rsq,F_FLOAT zeta_ij,F_FLOAT & prefactor)386 __device__ void force_zeta_prefactor(int iparam, F_FLOAT rsq, F_FLOAT zeta_ij,
387                                      F_FLOAT &prefactor)
388 {
389   F_FLOAT r, fa;
390   r = sqrt(rsq);
391   fa = ters_fa(r, iparam, params[iparam].bigr, params[iparam].bigd);
392   prefactor = -F_F(0.5) * fa * ters_bij_d(zeta_ij, iparam);
393 }
394 
395 
costheta_d(F_FLOAT3 & rij_hat,F_FLOAT & rij,F_FLOAT3 & rik_hat,F_FLOAT & rik,F_FLOAT3 & dri,F_FLOAT3 & drj,F_FLOAT3 & drk)396 __device__ void costheta_d(F_FLOAT3 &rij_hat, F_FLOAT &rij,
397                            F_FLOAT3 &rik_hat, F_FLOAT &rik,
398                            F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk)
399 {
400   // first element is derivative wrt Ri, second wrt Rj, third wrt Rk
401 
402   F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
403 
404   vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj);
405   vec3_scale(F_F(1.0) / rij, drj, drj);
406   vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk);
407   vec3_scale(F_F(1.0) / rik, drk, drk);
408   vec3_add(drj, drk, dri);
409   vec3_scale(-F_F(1.0), dri, dri);
410 }
411 
ters_zetaterm_d(F_FLOAT prefactor,F_FLOAT3 & rij_hat,F_FLOAT rij,F_FLOAT3 & rik_hat,F_FLOAT rik,F_FLOAT3 & dri,F_FLOAT3 & drj,F_FLOAT3 & drk,int iparam)412 __device__ void ters_zetaterm_d(F_FLOAT prefactor,
413                                 F_FLOAT3 &rij_hat, F_FLOAT rij,
414                                 F_FLOAT3 &rik_hat, F_FLOAT rik,
415                                 F_FLOAT3 &dri, F_FLOAT3 &drj, F_FLOAT3 &drk,
416                                 int iparam)
417 {
418   F_FLOAT ex_delr, ex_delr_d, tmp;
419   F_FLOAT3 dcosdri, dcosdrj, dcosdrk;
420 
421   if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
422   else tmp = params[iparam].lam3 * (rij - rik);
423 
424   if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
425   else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
426   else ex_delr = exp(tmp);
427 
428   if(params[iparam].powermint == 3)
429     ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
430   else ex_delr_d = params[iparam].lam3 * ex_delr;
431 
432 
433   const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
434   costheta_d(rij_hat, rij, rik_hat, rik, dcosdri, dcosdrj, dcosdrk);
435 
436   const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
437                        (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
438   const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
439   const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
440                               (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
441   const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
442   // dri = -dfc*gijk*ex_delr*rik_hat;
443   // dri += fc*gijk_d*ex_delr*dcosdri;
444   // dri += fc*gijk*ex_delr_d*(rik_hat - rij_hat);
445   const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
446   const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
447 
448 
449   vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri);
450   vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri);
451   vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri);
452   vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri);
453   vec3_scale(prefactor, dri, dri);
454   // compute the derivative wrt Rj
455   // drj = fc*gijk_d*ex_delr*dcosdrj;
456   // drj += fc*gijk*ex_delr_d*rij_hat;
457 
458   vec3_scale(fc * gijk_d * ex_delr, dcosdrj, drj);
459   vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj);
460   vec3_scale(prefactor, drj, drj);
461 
462   // compute the derivative wrt Rk
463   // drk = dfc*gijk*ex_delr*rik_hat;
464   // drk += fc*gijk_d*ex_delr*dcosdrk;
465   // drk += -fc*gijk*ex_delr_d*rik_hat;
466 
467   vec3_scale(dfc * gijk * ex_delr, rik_hat, drk);
468   vec3_scaleadd(fc * gijk_d * ex_delr, dcosdrk, drk, drk);
469   vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk);
470   vec3_scale(prefactor, drk, drk);
471 }
472 
ters_zetaterm_d_fi(F_FLOAT & prefactor,F_FLOAT3 & rij_hat,F_FLOAT & rij,F_FLOAT3 & rik_hat,F_FLOAT & rik,F_FLOAT3 & dri,int & iparam)473 __device__ void ters_zetaterm_d_fi(F_FLOAT &prefactor,
474                                    F_FLOAT3 &rij_hat, F_FLOAT &rij,
475                                    F_FLOAT3 &rik_hat, F_FLOAT &rik,
476                                    F_FLOAT3 &dri,  int &iparam)
477 {
478   F_FLOAT ex_delr, ex_delr_d, tmp;
479 
480   if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
481   else tmp = params[iparam].lam3 * (rij - rik);
482 
483   if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
484   else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
485   else ex_delr = exp(tmp);
486 
487   if(params[iparam].powermint == 3)
488     ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
489   else ex_delr_d = params[iparam].lam3 * ex_delr;
490 
491   const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
492   //costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk);
493 
494 
495   F_FLOAT3 dcosdri;
496   vec3_scaleadd(-cos_theta, rij_hat, rik_hat, dri);
497   vec3_scale(F_F(1.0) / rij, dri, dri);
498   vec3_scaleadd(-cos_theta, rik_hat, rij_hat, dcosdri);
499   vec3_scale(F_F(1.0) / rik, dcosdri, dcosdri);
500   vec3_add(dri, dcosdri, dcosdri);
501   vec3_scale(-F_F(1.0), dcosdri, dcosdri);
502 
503   const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
504                        (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
505   const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
506   const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
507                               (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
508   const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
509   //
510   const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
511   const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
512 
513   vec3_scale(-dfc * gijk * ex_delr, rik_hat, dri);
514   vec3_scaleadd(fc * gijk_d * ex_delr, dcosdri, dri, dri);
515   vec3_scaleadd(fc * gijk * ex_delr_d, rik_hat, dri, dri);
516   vec3_scaleadd(-fc * gijk * ex_delr_d, rij_hat, dri, dri);
517   vec3_scale(prefactor, dri, dri);
518 
519 }
520 
ters_zetaterm_d_fj(F_FLOAT & prefactor,F_FLOAT3 & rij_hat,F_FLOAT & rij,F_FLOAT3 & rik_hat,F_FLOAT & rik,F_FLOAT3 & drj,int & iparam)521 __device__ void ters_zetaterm_d_fj(F_FLOAT &prefactor,
522                                    F_FLOAT3 &rij_hat, F_FLOAT &rij,
523                                    F_FLOAT3 &rik_hat, F_FLOAT &rik,
524                                    F_FLOAT3 &drj, int &iparam)
525 {
526   F_FLOAT ex_delr, ex_delr_d, tmp;
527 
528   if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
529   else tmp = params[iparam].lam3 * (rij - rik);
530 
531   if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
532   else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
533   else ex_delr = exp(tmp);
534 
535   if(params[iparam].powermint == 3)
536     ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
537   else ex_delr_d = params[iparam].lam3 * ex_delr;
538 
539   const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
540   vec3_scaleadd(-cos_theta, rij_hat, rik_hat, drj);
541   vec3_scale(F_F(1.0) / rij, drj, drj);
542 
543   const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
544                        (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
545   const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
546   const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
547                               (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
548   const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
549 
550   const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
551 
552   vec3_scale(fc * gijk_d * ex_delr, drj, drj);
553   vec3_scaleadd(fc * gijk * ex_delr_d, rij_hat, drj, drj);
554   vec3_scale(prefactor, drj, drj);
555 }
556 
ters_zetaterm_d_fk(F_FLOAT & prefactor,F_FLOAT3 & rij_hat,F_FLOAT & rij,F_FLOAT3 & rik_hat,F_FLOAT & rik,F_FLOAT3 & drk,int & iparam)557 __device__ void ters_zetaterm_d_fk(F_FLOAT &prefactor,
558                                    F_FLOAT3 &rij_hat, F_FLOAT &rij,
559                                    F_FLOAT3 &rik_hat, F_FLOAT &rik,
560                                    F_FLOAT3 &drk, int &iparam)
561 {
562   F_FLOAT ex_delr, ex_delr_d, tmp;
563 
564   if(params[iparam].powermint == 3) tmp = (params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik) * params[iparam].lam3 * (rij - rik));
565   else tmp = params[iparam].lam3 * (rij - rik);
566 
567   if(tmp > F_F(69.0776)) ex_delr = F_F(1.e30);
568   else if(tmp < -F_F(69.0776)) ex_delr = F_F(0.0);
569   else ex_delr = exp(tmp);
570 
571   if(params[iparam].powermint == 3)
572     ex_delr_d = F_F(3.0) * (params[iparam].lam3 * params[iparam].lam3 * params[iparam].lam3) * (rij - rik) * (rij - rik) * ex_delr;
573   else ex_delr_d = params[iparam].lam3 * ex_delr;
574 
575   const F_FLOAT cos_theta = vec3_dot(rij_hat, rik_hat);
576   vec3_scaleadd(-cos_theta, rik_hat, rij_hat, drk);
577   vec3_scale(F_F(1.0) / rik, drk, drk);
578 
579   const F_FLOAT gijk = params[iparam].gamma * (F_F(1.0) + (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d) -
580                        (params[iparam].c * params[iparam].c) / (params[iparam].d * params[iparam].d + (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta)));
581   const F_FLOAT numerator = -F_F(2.0) * params[iparam].c * params[iparam].c * (params[iparam].h - cos_theta);
582   const F_FLOAT denominator = (params[iparam].d * params[iparam].d) +
583                               (params[iparam].h - cos_theta) * (params[iparam].h - cos_theta);
584   const F_FLOAT gijk_d = params[iparam].gamma * numerator / (denominator * denominator); // compute the derivative wrt Ri
585 
586   const F_FLOAT fc = ters_fc(rik, params[iparam].bigr, params[iparam].bigd);
587   const F_FLOAT dfc = ters_fc_d(rik, params[iparam].bigr, params[iparam].bigd);
588 
589   vec3_scale(fc * gijk_d * ex_delr, drk, drk);
590   vec3_scaleadd(dfc * gijk * ex_delr, rik_hat, drk, drk);
591   vec3_scaleadd(-fc * gijk * ex_delr_d, rik_hat, drk, drk);
592   vec3_scale(prefactor, drk, drk);
593 }
594 
attractive(int iparam,F_FLOAT prefactor,F_FLOAT4 & delij,F_FLOAT4 & delik,F_FLOAT3 & fi,F_FLOAT3 & fj,F_FLOAT3 & fk)595 __device__ void attractive(int iparam, F_FLOAT prefactor,
596                            F_FLOAT4 &delij,
597                            F_FLOAT4 &delik,
598                            F_FLOAT3 &fi, F_FLOAT3 &fj, F_FLOAT3 &fk)
599 {
600   F_FLOAT3 rij_hat, rik_hat;
601   F_FLOAT rij, rijinv, rik, rikinv;
602 
603   rij = sqrt(delij.w);
604   rijinv = F_F(1.0) / rij;
605   vec3_scale(rijinv, delij, rij_hat);
606 
607   rik = sqrt(delik.w);
608   rikinv = F_F(1.0) / rik;
609   vec3_scale(rikinv, delik, rik_hat);
610 
611   ters_zetaterm_d(prefactor, rij_hat, rij, rik_hat, rik, fi, fj, fk, iparam);
612 }
613 
attractive_fi(int & iparam,F_FLOAT & prefactor,F_FLOAT4 & delij,F_FLOAT4 & delik,F_FLOAT3 & f)614 __device__ void attractive_fi(int &iparam, F_FLOAT &prefactor,
615                               F_FLOAT4 &delij,
616                               F_FLOAT4 &delik,
617                               F_FLOAT3 &f)
618 {
619   F_FLOAT3 rij_hat, rik_hat;
620   F_FLOAT rij, rijinv, rik, rikinv;
621 
622   rij = sqrt(delij.w);
623   rijinv = F_F(1.0) / rij;
624   vec3_scale(rijinv, delij, rij_hat);
625 
626   rik = sqrt(delik.w);
627   rikinv = F_F(1.0) / rik;
628   vec3_scale(rikinv, delik, rik_hat);
629 
630   ters_zetaterm_d_fi(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
631 }
632 
attractive_fj(int iparam,F_FLOAT prefactor,F_FLOAT4 & delij,F_FLOAT4 & delik,F_FLOAT3 & f)633 __device__ void attractive_fj(int iparam, F_FLOAT prefactor,
634                               F_FLOAT4 &delij,
635                               F_FLOAT4 &delik,
636                               F_FLOAT3 &f)
637 {
638   F_FLOAT3 rij_hat, rik_hat;
639   F_FLOAT rij, rijinv, rik, rikinv;
640 
641   rij = sqrt(delij.w);
642   rijinv = F_F(1.0) / rij;
643   vec3_scale(rijinv, delij, rij_hat);
644 
645   rik = sqrt(delik.w);
646   rikinv = F_F(1.0) / rik;
647   vec3_scale(rikinv, delik, rik_hat);
648 
649   ters_zetaterm_d_fj(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
650 }
651 
attractive_fk(int iparam,F_FLOAT prefactor,F_FLOAT4 & delij,F_FLOAT4 & delik,F_FLOAT3 & f)652 __device__ void attractive_fk(int iparam, F_FLOAT prefactor,
653                               F_FLOAT4 &delij,
654                               F_FLOAT4 &delik,
655                               F_FLOAT3 &f)
656 {
657   F_FLOAT3 rij_hat, rik_hat;
658   F_FLOAT rij, rijinv, rik, rikinv;
659 
660   rij = sqrt(delij.w);
661   rijinv = F_F(1.0) / rij;
662   vec3_scale(rijinv, delij, rij_hat);
663 
664   rik = sqrt(delik.w);
665   rikinv = F_F(1.0) / rik;
666   vec3_scale(rikinv, delik, rik_hat);
667 
668   ters_zetaterm_d_fk(prefactor, rij_hat, rij, rik_hat, rik, f, iparam);
669 }
670 
Pair_Tersoff_Kernel_TpA_RIJ()671 __global__ void Pair_Tersoff_Kernel_TpA_RIJ()//F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
672 {
673   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
674 
675   if(ii >= _nall) return;
676 
677   X_FLOAT4 myxtype;
678   F_FLOAT4 delij;
679   F_FLOAT xtmp, ytmp, ztmp;
680   int itype, jnum, i, j;
681   int* jlist;
682   int neigh_red = 0;
683   i = ii;//_ilist[ii];
684   myxtype = fetchXType(i);
685 
686   xtmp = myxtype.x;
687   ytmp = myxtype.y;
688   ztmp = myxtype.z;
689   itype = map[(static_cast <int>(myxtype.w))];
690 
691   jnum = _numneigh[i];
692   jlist = &_neighbors[i];
693 
694   __syncthreads();
695 
696   for(int jj = 0; jj < jnum; jj++) {
697     if(jj < jnum) {
698 
699       j = jlist[jj * _nall];
700       j &= NEIGHMASK;
701       myxtype = fetchXType(j);
702       delij.x = xtmp - myxtype.x;
703       delij.y = ytmp - myxtype.y;
704       delij.z = ztmp - myxtype.z;
705       int jtype = map[(static_cast <int>(myxtype.w))];
706       int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
707       delij.w = vec3_dot(delij, delij);
708 
709       if(delij.w < params[iparam_ij].cutsq) {
710         _glob_neighbors_red[i + neigh_red * _nall] = j;
711         _glob_neightype_red[i + neigh_red * _nall] = jtype;
712         _glob_r_ij[i + neigh_red * _nall] = delij;
713         neigh_red++;
714       }
715     }
716   }
717 
718   _glob_numneigh_red[i] = neigh_red;
719 }
720 
721 
Pair_Tersoff_Kernel_TpA_ZetaIJ()722 __global__ void Pair_Tersoff_Kernel_TpA_ZetaIJ()//F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
723 {
724 
725   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
726 
727   if(ii >= _nall) return;
728 
729 
730   F_FLOAT4 delij;
731   F_FLOAT4 delik;
732 
733   int itype, jnum, i, j;
734   int* jlist;
735   i = ii;
736   itype = map[(static_cast <int>(_type[i]))];
737 
738   jnum = _glob_numneigh_red[i];
739   jlist = &_glob_neighbors_red[i];
740 
741   __syncthreads();
742 
743   for(int jj = 0; jj < jnum; jj++) {
744     if(jj < jnum) {
745 
746       j = jlist[jj * _nall];
747       j &= NEIGHMASK;
748       int jtype = _glob_neightype_red[i + jj * _nall];
749       delij = _glob_r_ij[i + jj * _nall];
750 
751       int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
752 
753       if(delij.w < params[iparam_ij].cutsq) {
754         F_FLOAT zeta_ij = 0.0;
755         F_FLOAT3 delij3 = {delij.x, delij.y, delij.z};
756 
757         for(int kk = 0; kk < jnum; kk++) {
758           if(jj == kk) continue;
759 
760           int k = jlist[kk * _nall];
761           k &= NEIGHMASK;
762 
763           int ktype = _glob_neightype_red[i + kk * _nall];
764           delik = _glob_r_ij[i + kk * _nall];
765           F_FLOAT3 delik3 = {delik.x, delik.y, delik.z};
766           int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype];
767           const F_FLOAT rsqki = delik.w;
768 
769           if(rsqki <= params[iparam_ijk].cutsq)
770             zeta_ij += zeta(iparam_ijk, delij.w, rsqki, delij3, delik3);
771         }
772 
773         _glob_zeta_ij[i + jj * _nall] = zeta_ij;
774       }
775     }
776   }
777 }
778 
779 //back3: num 12 steps 10: ZetaIJ/TPA 0.255/0.106
780 //back5: num 12 steps 10: ZetaIJ/TPA 0.257/0.098
781 //back6: num 12 steps 10: ZetaIJ/TPA 0.027/0.097 /rij berechnung extra
782 //back12: num 12 steps 10: ZetaIJ/TPA 0.026/0.070
783 //back15: num 12 steps 10: ZetaIJ/TPA 0.0137/0.0287 //pow beseitigt
784 //        num 12 steps 10: ZetaIJ/TPA 0.0137/0.027
785 template <int eflag, int vflagm>
Pair_Tersoff_Kernel_TpA(int eflag_atom,int vflag_atom)786 __global__ void Pair_Tersoff_Kernel_TpA(int eflag_atom, int vflag_atom) //,F_FLOAT* _glob_zeta_ij,F_FLOAT4* _glob_r_ij,int* _glob_numneigh_red,int* _glob_neighbors_red,int* _glob_neightype_red)
787 {
788   ENERGY_FLOAT evdwl = ENERGY_F(0.0);
789 
790   ENERGY_FLOAT* sharedE = &sharedmem[threadIdx.x];
791   ENERGY_FLOAT* sharedV = &sharedmem[threadIdx.x];
792 
793   F_FLOAT* shared_F_F = (F_FLOAT*) sharedmem;
794 
795   if((eflag || eflag_atom) && (vflagm || vflag_atom)) shared_F_F = (F_FLOAT*) &sharedmem[7 * blockDim.x];
796   else if(eflag) shared_F_F = (F_FLOAT*) &sharedmem[blockDim.x];
797   else if(vflagm) shared_F_F = (F_FLOAT*) &sharedmem[6 * blockDim.x];
798 
799   shared_F_F += threadIdx.x;
800 
801   if(eflag_atom || eflag) {
802     sharedE[0] = ENERGY_F(0.0);
803     sharedV += blockDim.x;
804   }
805 
806   if(vflagm || vflag_atom) {
807     sharedV[0 * blockDim.x] = ENERGY_F(0.0);
808     sharedV[1 * blockDim.x] = ENERGY_F(0.0);
809     sharedV[2 * blockDim.x] = ENERGY_F(0.0);
810     sharedV[3 * blockDim.x] = ENERGY_F(0.0);
811     sharedV[4 * blockDim.x] = ENERGY_F(0.0);
812     sharedV[5 * blockDim.x] = ENERGY_F(0.0);
813   }
814 
815   int jnum_red = 0;
816 #define fxtmp shared_F_F[0]
817 #define fytmp shared_F_F[blockDim.x]
818 #define fztmp shared_F_F[2*blockDim.x]
819   //#define jnum_red (static_cast <int> (shared_F_F[3*blockDim.x]))
820 
821   int ii = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
822 
823   X_FLOAT4 myxtype_i, myxtype_j, myxtype_k;
824   F_FLOAT4 delij, delik, deljk;
825   F_FLOAT fpair;
826   F_FLOAT prefactor_ij, prefactor_ji;
827 
828   int itype, i, j;
829   int* jlist_red;
830 
831   if(ii < _inum) {
832     i = _ilist[ii];
833 
834     if(vflagm)
835       myxtype_i = fetchXType(i);
836 
837     //itype=map[(static_cast <int> (myxtype_i.w))];
838     itype = map[_type[i]];
839 
840 
841     fxtmp = F_F(0.0);
842     fytmp = F_F(0.0);
843     fztmp = F_F(0.0);
844 
845 
846     //shared_F_F[3*blockDim.x] = _glob_numneigh_red[i];
847     jnum_red = _glob_numneigh_red[i];
848     jlist_red = &_glob_neighbors_red[i];
849   }
850 
851   __syncthreads();
852 
853 #pragma unroll 1
854 
855   for(int jj = 0; jj < jnum_red; jj++) {
856     if(i < _nlocal) {
857       fpair = F_F(0.0);
858       j = jlist_red[jj * _nall];
859       j &= NEIGHMASK;
860 
861       if(vflagm)
862         myxtype_j = fetchXType(j);
863 
864       int jtype = _glob_neightype_red[i + jj * _nall];
865       delij = _glob_r_ij[i + jj * _nall];
866 
867       volatile int iparam_ij = elem2param[(itype * nelements + jtype) * nelements + jtype];
868       volatile int iparam_ji = elem2param[(jtype * nelements + itype) * nelements + itype];
869 
870       if(delij.w < params[iparam_ij].cutsq) {
871         F_FLOAT dxfp, dyfp, dzfp;
872         repulsive(iparam_ij, delij.w, fpair, eflag, evdwl);
873         fxtmp += dxfp = delij.x * fpair;
874         fytmp += dyfp = delij.y * fpair;
875         fztmp += dzfp = delij.z * fpair;
876 
877         if(vflagm) {
878           sharedV[0 * blockDim.x] += delij.x * dxfp;
879           sharedV[1 * blockDim.x] += delij.y * dyfp;
880           sharedV[2 * blockDim.x] += delij.z * dzfp;
881           sharedV[3 * blockDim.x] += delij.x * dyfp;
882           sharedV[4 * blockDim.x] += delij.x * dzfp;
883           sharedV[5 * blockDim.x] += delij.y * dzfp;
884         }
885 
886 
887 
888         force_zeta(iparam_ij, delij.w, _glob_zeta_ij[i + jj * _nall], fpair, prefactor_ij, eflag, evdwl);
889         fxtmp -=
890           dxfp = delij.x * fpair;
891         fytmp -=
892           dyfp = delij.y * fpair;
893         fztmp -=
894           dzfp = delij.z * fpair;
895 
896         if(vflagm) {
897           sharedV[0 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dxfp;
898           sharedV[1 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dyfp;
899           sharedV[2 * blockDim.x] -= ENERGY_F(2.0) * delij.z * dzfp;
900           sharedV[3 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dyfp;
901           sharedV[4 * blockDim.x] -= ENERGY_F(2.0) * delij.x * dzfp;
902           sharedV[5 * blockDim.x] -= ENERGY_F(2.0) * delij.y * dzfp;
903         }
904 
905         int j_jj = 0;
906 
907         //#pragma unroll 1
908         for(int kk = 0; kk < _glob_numneigh_red[j]; kk++) {
909           if(_glob_neighbors_red[j + kk * _nall] == i) j_jj = kk;
910         }
911 
912         force_zeta_prefactor_force(iparam_ji, delij.w, _glob_zeta_ij[j + j_jj * _nall], fpair, prefactor_ji);
913 
914         fxtmp -=
915           dxfp = delij.x * fpair;
916         fytmp -=
917           dyfp = delij.y * fpair;
918         fztmp -=
919           dzfp = delij.z * fpair;
920 
921 
922 
923         vec3_scale(F_F(-1.0), delij, delij);
924 
925 #pragma unroll 1
926 
927         for(int kk = 0; kk < jnum_red; kk++) {
928           if(jj == kk) continue;
929 
930           int k = jlist_red[kk * _nall];
931           k &= NEIGHMASK;
932 
933           if(vflagm)
934             myxtype_k = fetchXType(k);
935 
936           delik = _glob_r_ij[i + kk * _nall];
937 
938           int ktype = _glob_neightype_red[i + kk * _nall];
939           int iparam_ijk = elem2param[(itype * nelements + jtype) * nelements + ktype];
940           vec3_scale(F_F(-1.0), delik, delik);
941 
942           if(delik.w <= params[iparam_ijk].cutsq) {
943             if(vflagm) {
944               F_FLOAT3 fi, fj, fk;
945               attractive(iparam_ijk, prefactor_ij,
946                          delij, delik, fi, fj, fk);
947               fxtmp += fi.x;
948               fytmp += fi.y;
949               fztmp += fi.z;
950 
951               sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.x;
952               sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.y;
953               sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.z * fi.z;
954               sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.y;
955               sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.x * fi.z;
956               sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_i.y * fi.z;
957 
958               sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.x;
959               sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.y;
960               sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.z * fj.z;
961               sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.y;
962               sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.x * fj.z;
963               sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_j.y * fj.z;
964 
965               sharedV[0 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.x;
966               sharedV[1 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.y;
967               sharedV[2 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.z * fk.z;
968               sharedV[3 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.y;
969               sharedV[4 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.x * fk.z;
970               sharedV[5 * blockDim.x] += ENERGY_F(2.0) * myxtype_k.y * fk.z;
971             } else {
972               F_FLOAT3 fi; //local variable
973               attractive_fi(iparam_ijk, prefactor_ij,
974                             delij, delik, fi);
975               fxtmp += fi.x;
976               fytmp += fi.y;
977               fztmp += fi.z;
978 
979             }
980           }
981         }
982 
983         int j_jnum_red = _glob_numneigh_red[j];
984         int* j_jlist_red = &_glob_neighbors_red[j];
985 
986         int j_ii = 0;
987 
988         //#pragma unroll 1
989         for(int j_kk = 0; j_kk < j_jnum_red; j_kk++) {
990           if(j_jlist_red[j_kk * _nall] == i) j_ii = j_kk;
991         }
992 
993 #pragma unroll 1
994 
995         for(int kk = 0; kk < j_jnum_red; kk++) {
996           if(j_ii == kk) continue;
997 
998           int k = j_jlist_red[kk * _nall];
999           k &= NEIGHMASK;
1000           deljk = _glob_r_ij[j + kk * _nall];
1001           vec3_scale(F_F(-1.0), deljk, deljk);
1002           int ktype = _glob_neightype_red[j + kk * _nall];
1003 
1004           int iparam_jik = elem2param[(jtype * nelements + itype) * nelements + ktype];
1005           int iparam_jki = elem2param[(jtype * nelements + ktype) * nelements + itype];
1006 
1007 
1008           vec3_scale(F_F(-1.0), delij, delij);
1009 
1010           if(deljk.w <= params[iparam_jik].cutsq) {
1011             F_FLOAT3 ftmp; //local variable
1012 
1013             attractive_fj(iparam_jik, prefactor_ji,
1014                           delij, deljk, ftmp);
1015             fxtmp += ftmp.x;
1016             fytmp += ftmp.y;
1017             fztmp += ftmp.z;
1018             int iparam_jk = elem2param[(jtype * nelements + ktype) * nelements + ktype];
1019             F_FLOAT prefactor_jk;
1020             force_zeta_prefactor(iparam_jk, deljk.w, _glob_zeta_ij[j + kk * _nall], prefactor_jk);
1021 
1022             attractive_fk(iparam_jki, prefactor_jk,
1023                           deljk, delij, ftmp);
1024             fxtmp += ftmp.x;
1025             fytmp += ftmp.y;
1026             fztmp += ftmp.z;
1027 
1028           }
1029 
1030           vec3_scale(F_F(-1.0), delij, delij);
1031         }
1032       }
1033     }
1034 
1035   }
1036 
1037   __syncthreads();
1038 
1039   if(ii < _inum) {
1040     F_FLOAT* my_f;
1041 
1042     if(_collect_forces_later) {
1043       ENERGY_FLOAT* buffer = (ENERGY_FLOAT*) _buffer;
1044 
1045       if(eflag) {
1046         buffer = &buffer[1 * gridDim.x * gridDim.y];
1047       }
1048 
1049       if(vflagm) {
1050         buffer = &buffer[6 * gridDim.x * gridDim.y];
1051       }
1052 
1053       my_f = (F_FLOAT*) buffer;
1054       my_f += i;
1055       *my_f = fxtmp;
1056       my_f += _nmax;
1057       *my_f = fytmp;
1058       my_f += _nmax;
1059       *my_f = fztmp;
1060     } else {
1061       my_f = _f + i;
1062       *my_f += fxtmp;
1063       my_f += _nmax;
1064       *my_f += fytmp;
1065       my_f += _nmax;
1066       *my_f += fztmp;
1067     }
1068   }
1069 
1070   __syncthreads();
1071 
1072   if(eflag) {
1073     sharedE[0] = evdwl;
1074   }
1075 
1076   if(eflag_atom && i < _nlocal) {
1077     _eatom[i] = ENERGY_F(0.5) * evdwl;
1078   }
1079 
1080   if(vflag_atom && i < _nlocal) {
1081     _vatom[i]         = ENERGY_F(0.5) * sharedV[0 * blockDim.x];
1082     _vatom[i + _nmax]   = ENERGY_F(0.5) * sharedV[1 * blockDim.x];
1083     _vatom[i + 2 * _nmax] = ENERGY_F(0.5) * sharedV[2 * blockDim.x];
1084     _vatom[i + 3 * _nmax] = ENERGY_F(0.5) * sharedV[3 * blockDim.x];
1085     _vatom[i + 4 * _nmax] = ENERGY_F(0.5) * sharedV[4 * blockDim.x];
1086     _vatom[i + 5 * _nmax] = ENERGY_F(0.5) * sharedV[5 * blockDim.x];
1087   }
1088 
1089   if(vflagm && eflag) PairVirialCompute_A_Kernel_Template<1, 1>();
1090   else if(eflag) PairVirialCompute_A_Kernel_Template<1, 0>();
1091   else if(vflagm) PairVirialCompute_A_Kernel_Template<0, 1>();
1092 
1093 #undef fxtmp
1094 #undef fytmp
1095 #undef fztmp
1096   //#undef jnum_red
1097 }
1098