// clang-format off
/* -*- c++ -*- -------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

#ifdef __INTEL_LLVM_COMPILER
#define USE_OMP_SIMD
#define __INTEL_COMPILER __INTEL_LLVM_COMPILER
#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
#endif

#ifdef __INTEL_COMPILER
#define LMP_SIMD_COMPILER
#if (__INTEL_COMPILER_BUILD_DATE > 20160720)
#define LMP_INTEL_USE_SIMDOFF
#endif
#pragma warning (disable:3948)
#pragma warning (disable:3949)
#pragma warning (disable:13200)
#endif

#ifdef __INTEL_OFFLOAD
#ifdef LMP_INTEL_OFFLOAD
#define _LMP_INTEL_OFFLOAD
#ifdef __TARGET_ARCH_MIC
#ifndef __MIC__
#define __MIC__ 1
#endif
#endif
#endif
#endif

#ifndef LMP_INTEL_PREPROCESS_H
#define LMP_INTEL_PREPROCESS_H

// LAMMPS_MEMALIGN is set to 64 by default for -DLMP_INTEL
// so we only need to error out in case of a different alignment
#if LAMMPS_MEMALIGN && (LAMMPS_MEMALIGN != 64)
#error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS of your LAMMPS makefile for the INTEL package
#endif

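// Pragma wrappers: expand to a _Pragma only when the feature is available
// (_use_omp_pragma requires OpenMP; _use_simd_pragma requires a compiler
// with the Intel SIMD pragmas), and to nothing otherwise.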
#if defined(_OPENMP)
#define _use_omp_pragma(txt) _Pragma(txt)
#else
#define _use_omp_pragma(txt)
#endif

#if defined(LMP_SIMD_COMPILER)
#define _use_simd_pragma(txt) _Pragma(txt)
#else
#define _use_simd_pragma(txt)
#endif

namespace LAMMPS_NS {

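// Indices into the bounds/overflow array shared with offload code, and
// timer categories used by the package's load-balance instrumentation.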
enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
      LMP_GHOST_MAX};
enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
      TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
      TIME_IMBALANCE};

#define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
#define INTEL_MIC_VECTOR_WIDTH 16
#define INTEL_VECTOR_WIDTH 4
#define INTEL_MAX_STENCIL 256
// INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
#define INTEL_MAX_STENCIL_CHECK 4096
#define INTEL_P3M_MAXORDER 8
#define INTEL_P3M_ALIGNED_MAXORDER 8
// PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
#define INTEL_P3M_TABLE 1

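// Refine the default vector width from the targeted instruction set:
// 8 single-precision lanes for AVX/AVX2, 16 for AVX-512 or MIC; without
// an Intel compiler the widths fall back to scalar values.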
#ifdef __INTEL_COMPILER
#ifdef __AVX__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 8
#endif

#ifdef __AVX2__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 8
#endif

#ifdef __AVX512F__
#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 16
#define INTEL_V512 1
#define INTEL_VMASK 1
#else
#ifdef __MIC__
#define INTEL_V512 1
#define INTEL_VMASK 1
#define INTEL_HTHREADS 4
#endif
#endif

#ifdef __AVX512ER__
#define INTEL_HTHREADS 4
#endif

#ifdef __AVX512CD__
#ifndef _LMP_INTEL_OFFLOAD
#define LMP_USE_AVXCD
#endif
#endif

#ifdef __MIC__
#define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
#else
#define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
#endif

#else

#undef INTEL_VECTOR_WIDTH
#define INTEL_VECTOR_WIDTH 1
#define INTEL_COMPILE_WIDTH 1

#endif

#define INTEL_DATA_ALIGN 64
#define INTEL_ONEATOM_FACTOR 1
#define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
#define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
#define INTEL_LB_MEAN_WEIGHT 0.1
#define INTEL_BIGP 1e15
#define INTEL_MAX_HOST_CORE_COUNT 512
#define INTEL_MAX_COI_CORES 36

#ifndef INTEL_HTHREADS
#define INTEL_HTHREADS 2
#endif

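// Round a count n up to a multiple of the data alignment, measured in
// elements of size esize.  Assumes INTEL_DATA_ALIGN / esize is a power of
// two so the round-up can be done with a mask; e.g. with 8-byte elements
// and 64-byte alignment, n = 13 becomes 16.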
#if INTEL_DATA_ALIGN > 1

#define IP_PRE_edge_align(n, esize)                                     \
  {                                                                     \
    const int pad_mask = ~static_cast<int>(INTEL_DATA_ALIGN/esize-1);   \
    n = (n + INTEL_DATA_ALIGN / esize - 1) & pad_mask;                  \
  }

#else

#define IP_PRE_edge_align(n, esize)

#endif

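// Compute a padded stride (in elements of size datasize) for per-thread
// force accumulation arrays of n entries (doubled when torque is stored)
// so that each thread's copy starts on an INTEL_DATA_ALIGN boundary.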
#define IP_PRE_get_stride(stride, n, datasize, torque)          \
  {                                                             \
    int blength = n;                                            \
    if (torque) blength *= 2;                                   \
    const int bytes = blength * datasize;                       \
    stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN);     \
    stride = blength + stride / datasize;                       \
  }

#if defined(_OPENMP)

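// Divide inum iterations into contiguous [ifrom, ito) blocks, one per
// thread, handing the remainder to the lowest-numbered threads.  An
// illustrative sketch of the intended call pattern:
//
//   int ifrom, ito, tid;
//   IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads);
//   for (int i = ifrom; i < ito; i++) { /* thread-local work */ }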
#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
  {                                                             \
    int idelta = inum/nthreads;                                 \
    const int imod = inum % nthreads;                           \
    ifrom = tid * idelta;                                       \
    ito = ifrom + idelta;                                       \
    if (tid < imod) {                                           \
      ito+=tid+1;                                               \
      ifrom+=tid;                                               \
    } else {                                                    \
      ito+=imod;                                                \
      ifrom+=imod;                                              \
    }                                                           \
  }

#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
  {                                                             \
    tid = omp_get_thread_num();                                 \
    IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads);          \
  }

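// Strided decomposition for hyperthreading: for nthr <= INTEL_HTHREADS all
// threads interleave over the full range with stride ip = nthr; when nthr
// is a multiple of INTEL_HTHREADS, each group of INTEL_HTHREADS hardware
// threads interleaves within one contiguous block; otherwise fall back to
// plain blocks with ip = 1.  Loops run: for (i = ifrom; i < ito; i += ip).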
#define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr)      \
  {                                                             \
    if (nthr <= INTEL_HTHREADS) {                               \
      ifrom = tid;                                              \
      ito = inum;                                               \
      ip = nthr;                                                \
    } else if (nthr % INTEL_HTHREADS == 0) {                    \
      int nd = nthr / INTEL_HTHREADS;                           \
      int td = tid / INTEL_HTHREADS;                            \
      int tm = tid % INTEL_HTHREADS;                            \
      IP_PRE_omp_range(ifrom, ito, td, inum, nd);               \
      ifrom += tm;                                              \
      ip = INTEL_HTHREADS;                                      \
    } else {                                                    \
      IP_PRE_omp_range(ifrom, ito, tid, inum, nthr);            \
      ip = 1;                                                   \
    }                                                           \
  }

#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
  {                                                             \
    tid = omp_get_thread_num();                                 \
    IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr);         \
  }

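// Block decomposition where each thread starts on a chunk boundary of
// INTEL_DATA_ALIGN / datasize elements, keeping per-thread writes on
// separate cache lines; the final range is clipped to inum.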
#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                             datasize)                          \
{                                                               \
  int chunk_size = INTEL_DATA_ALIGN / datasize;                 \
  int idelta = static_cast<int>(ceil(static_cast<float>(inum)   \
                                     /chunk_size/nthreads));    \
  idelta *= chunk_size;                                         \
  ifrom = tid*idelta;                                           \
  ito = ifrom + idelta;                                         \
  if (ito > inum) ito = inum;                                   \
}

#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
                                nthreads, datasize)             \
  {                                                             \
    tid = omp_get_thread_num();                                 \
    IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads,     \
                           datasize);                           \
  }

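// Same chunked decomposition with the chunk size given directly as a
// vector width, so each thread's range starts at a multiple of vecsize.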
#define IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads,   \
                             vecsize)                           \
  {                                                             \
    int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
                                       /vecsize/nthreads));     \
    idelta *= vecsize;                                          \
    ifrom = tid*idelta;                                         \
    ito = ifrom + idelta;                                       \
    if (ito > inum) ito = inum;                                 \
  }

#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
                                nthreads, vecsize)              \
  {                                                             \
    tid = omp_get_thread_num();                                 \
    IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads,       \
                         vecsize);                              \
  }

#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
                                 nthr, vecsize)                 \
  {                                                             \
    tid = omp_get_thread_num();                                 \
    if (nthr <= INTEL_HTHREADS) {                               \
      ifrom = tid*vecsize;                                      \
      ito = inum;                                               \
      ip = nthr*vecsize;                                        \
    } else if (nthr % INTEL_HTHREADS == 0) {                    \
      int nd = nthr / INTEL_HTHREADS;                           \
      int td = tid / INTEL_HTHREADS;                            \
      int tm = tid % INTEL_HTHREADS;                            \
      IP_PRE_omp_range_vec(ifrom, ito, td, inum, nd, vecsize);  \
      ifrom += tm * vecsize;                                    \
      ip = INTEL_HTHREADS * vecsize;                            \
    } else {                                                    \
      IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthr,         \
                           vecsize);                            \
      ip = vecsize;                                             \
    }                                                           \
  }

#else

#define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads)    \
  {                                                             \
    tid = 0;                                                    \
    ifrom = 0;                                                  \
    ito = inum;                                                 \
  }

#define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads)       \
  {                                                             \
    ifrom = 0;                                                  \
    ito = inum;                                                 \
  }

#define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr)   \
  {                                                             \
    tid = 0;                                                    \
    ifrom = 0;                                                  \
    ito = inum;                                                 \
    ip = 1;                                                     \
  }

#define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
                             datasize)                          \
{                                                               \
    ifrom = 0;                                                  \
    ito = inum;                                                 \
}

#define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum,        \
                                nthreads, datasize)             \
{                                                               \
  tid = 0;                                                      \
  ifrom = 0;                                                    \
  ito = inum;                                                   \
}

#define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum,          \
                                nthreads, vecsize)              \
  {                                                             \
    tid = 0;                                                    \
    ifrom = 0;                                                  \
    ito = inum;                                                 \
  }

#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
                                 nthr, vecsize)                 \
  {                                                             \
    tid = 0;                                                    \
    ifrom = 0;                                                  \
    ip = vecsize;                                               \
    ito = inum;                                                 \
  }

#endif

// TO BE DEPRECATED
#ifndef USE_OMP_SIMD

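// Reduce the per-thread force copies (spaced f_stride4 apart) into the
// first copy while accumulating the f dot r virial: ovv gathers the
// diagonal terms lane by lane and ov3..ov5 pick up the off-diagonal terms
// from the x/y/z components packed with stride 4.  Specialized for
// nthreads of 1 through 4.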
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
                                  f_stride, pos, ov0, ov1, ov2,         \
                                  ov3, ov4, ov5)                        \
{                                                                       \
  acc_t *f_scalar = &f_start[0].x;                                      \
  flt_t *x_scalar = &pos[minlocal].x;                                   \
  int f_stride4 = f_stride * 4;                                         \
  _alignvar(acc_t ovv[16],64);                                          \
  int vwidth;                                                           \
  if (sizeof(acc_t) == sizeof(double))                                  \
    vwidth = INTEL_COMPILE_WIDTH/2;                                     \
  else                                                                  \
    vwidth = INTEL_COMPILE_WIDTH;                                       \
  if (vwidth < 4) vwidth = 4;                                           \
  _use_simd_pragma("vector aligned")                                    \
  _use_simd_pragma("simd")                                              \
  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;                 \
  int remainder = lt % vwidth;                                          \
  if (lf > lt) remainder = 0;                                           \
  const int v_range = lt - remainder;                                   \
  if (nthreads == 2) {                                                  \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("vector aligned")                                \
      _use_simd_pragma("simd")                                          \
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v];                                \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n];                                      \
  } else if (nthreads==4) {                                             \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
    acc_t *f_scalar4 = f_scalar3 + f_stride4;                           \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("vector aligned")                                \
      _use_simd_pragma("simd")                                          \
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +              \
          f_scalar4[n+v];                                               \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];        \
  } else if (nthreads==1) {                                             \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("vector aligned")                                \
      _use_simd_pragma("simd")                                          \
      for (int v = 0; v < vwidth; v++)                                  \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
  } else if (nthreads==3) {                                             \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("vector aligned")                                \
      _use_simd_pragma("simd")                                          \
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];               \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n] + f_scalar3[n];                       \
  }                                                                     \
  for (int n = v_range; n < lt; n += 4) {                               \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    for (int v = 0; v < 4; v++)                                         \
      ovv[v] += f_scalar[n+v] * x_scalar[n+v];                          \
    ov3 += f_scalar[n+1] * x_scalar[n+0];                               \
    ov4 += f_scalar[n+2] * x_scalar[n+0];                               \
    ov5 += f_scalar[n+2] * x_scalar[n+1];                               \
  }                                                                     \
  ov0 += ovv[0];                                                        \
  ov1 += ovv[1];                                                        \
  ov2 += ovv[2];                                                        \
  if (vwidth > 4) {                                                     \
    ov0 += ovv[4];                                                      \
    ov1 += ovv[5];                                                      \
    ov2 += ovv[6];                                                      \
  }                                                                     \
  if (vwidth > 8) {                                                     \
    ov0 += ovv[8] + ovv[12];                                            \
    ov1 += ovv[9] + ovv[13];                                            \
    ov2 += ovv[10] + ovv[14];                                           \
  }                                                                     \
}

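// Accumulate all per-thread force copies into the first and, when f dot r
// virial accumulation is requested, fold the first MIN(4,nthreads) copies
// through the specialized reduction above.  Expects iifrom, iito, and tid
// to be declared at the call site.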
#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,       \
                               f_stride, pos, offload, vflag, ov0, ov1, \
                               ov2, ov3, ov4, ov5)                      \
{                                                                       \
  int o_range = (nall - minlocal) * 4;                                  \
  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,       \
                            sizeof(acc_t));                             \
                                                                        \
  acc_t *f_scalar = &f_start[0].x;                                      \
  int f_stride4 = f_stride * 4;                                         \
  int t;                                                                \
  if (vflag == VIRIAL_FDOTR) t = 4; else t = 1;                         \
  acc_t *f_scalar2 = f_scalar + f_stride4 * t;                          \
  for ( ; t < nthreads; t++) {                                          \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("simd")                                            \
    for (int n = iifrom; n < iito; n++)                                 \
      f_scalar[n] += f_scalar2[n];                                      \
    f_scalar2 += f_stride4;                                             \
  }                                                                     \
                                                                        \
  if (vflag == VIRIAL_FDOTR) {                                          \
    int nt_min = MIN(4,nthreads);                                       \
    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,  \
                              f_stride, pos, ov0, ov1, ov2, ov3, ov4,   \
                              ov5);                                     \
  }                                                                     \
}

#else

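// OpenMP-SIMD variants of the two reductions above: the logic is identical,
// but the main vector loops use "omp simd aligned(...)" pragmas in place of
// the Intel-specific "vector aligned"/"simd" pair.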
#define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
                                  f_stride, pos, ov0, ov1, ov2,         \
                                  ov3, ov4, ov5)                        \
{                                                                       \
  acc_t *f_scalar = &f_start[0].x;                                      \
  flt_t *x_scalar = &pos[minlocal].x;                                   \
  int f_stride4 = f_stride * 4;                                         \
  _alignvar(acc_t ovv[16],64);                                          \
  int vwidth;                                                           \
  if (sizeof(acc_t) == sizeof(double))                                  \
    vwidth = INTEL_COMPILE_WIDTH/2;                                     \
  else                                                                  \
    vwidth = INTEL_COMPILE_WIDTH;                                       \
  if (vwidth < 4) vwidth = 4;                                           \
  _use_simd_pragma("omp simd aligned(ovv:64)")                          \
  for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0;                 \
  int remainder = lt % vwidth;                                          \
  if (lf > lt) remainder = 0;                                           \
  const int v_range = lt - remainder;                                   \
  if (nthreads == 2) {                                                  \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v];                                \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n];                                      \
  } else if (nthreads==4) {                                             \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
    acc_t *f_scalar4 = f_scalar3 + f_stride4;                           \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] +              \
          f_scalar4[n+v];                                               \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n];        \
  } else if (nthreads==1) {                                             \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)")    \
      for (int v = 0; v < vwidth; v++)                                  \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
  } else if (nthreads==3) {                                             \
    acc_t *f_scalar2 = f_scalar + f_stride4;                            \
    acc_t *f_scalar3 = f_scalar2 + f_stride4;                           \
    for (int n = lf; n < v_range; n += vwidth) {                        \
      _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
      for (int v = 0; v < vwidth; v++) {                                \
        f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v];               \
        ovv[v] += f_scalar[n+v] * x_scalar[n+v];                        \
      }                                                                 \
      ov3 += f_scalar[n+1] * x_scalar[n+0];                             \
      ov4 += f_scalar[n+2] * x_scalar[n+0];                             \
      ov5 += f_scalar[n+2] * x_scalar[n+1];                             \
      if (vwidth > 4) {                                                 \
        ov3 += f_scalar[n+5] * x_scalar[n+4];                           \
        ov4 += f_scalar[n+6] * x_scalar[n+4];                           \
        ov5 += f_scalar[n+6] * x_scalar[n+5];                           \
      }                                                                 \
      if (vwidth > 8) {                                                 \
        ov3 += f_scalar[n+9] * x_scalar[n+8];                           \
        ov3 += f_scalar[n+13] * x_scalar[n+12];                         \
        ov4 += f_scalar[n+10] * x_scalar[n+8];                          \
        ov4 += f_scalar[n+14] * x_scalar[n+12];                         \
        ov5 += f_scalar[n+10] * x_scalar[n+9];                          \
        ov5 += f_scalar[n+14] * x_scalar[n+13];                         \
      }                                                                 \
    }                                                                   \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)")      \
    for (int n = v_range; n < lt; n++)                                  \
      f_scalar[n] += f_scalar2[n] + f_scalar3[n];                       \
  }                                                                     \
  for (int n = v_range; n < lt; n += 4) {                               \
    _use_simd_pragma("vector aligned")                                  \
    _use_simd_pragma("ivdep")                                           \
    for (int v = 0; v < 4; v++)                                         \
      ovv[v] += f_scalar[n+v] * x_scalar[n+v];                          \
    ov3 += f_scalar[n+1] * x_scalar[n+0];                               \
    ov4 += f_scalar[n+2] * x_scalar[n+0];                               \
    ov5 += f_scalar[n+2] * x_scalar[n+1];                               \
  }                                                                     \
  ov0 += ovv[0];                                                        \
  ov1 += ovv[1];                                                        \
  ov2 += ovv[2];                                                        \
  if (vwidth > 4) {                                                     \
    ov0 += ovv[4];                                                      \
    ov1 += ovv[5];                                                      \
    ov2 += ovv[6];                                                      \
  }                                                                     \
  if (vwidth > 8) {                                                     \
    ov0 += ovv[8] + ovv[12];                                            \
    ov1 += ovv[9] + ovv[13];                                            \
    ov2 += ovv[10] + ovv[14];                                           \
  }                                                                     \
}

#define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,       \
                               f_stride, pos, offload, vflag, ov0, ov1, \
                               ov2, ov3, ov4, ov5)                      \
{                                                                       \
  int o_range = (nall - minlocal) * 4;                                  \
  IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads,       \
                            sizeof(acc_t));                             \
                                                                        \
  acc_t *f_scalar = &f_start[0].x;                                      \
  int f_stride4 = f_stride * 4;                                         \
  int t;                                                                \
  if (vflag == VIRIAL_FDOTR) t = 4; else t = 1;                         \
  acc_t *f_scalar2 = f_scalar + f_stride4 * t;                          \
  for ( ; t < nthreads; t++) {                                          \
    _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)")         \
    for (int n = iifrom; n < iito; n++)                                 \
      f_scalar[n] += f_scalar2[n];                                      \
    f_scalar2 += f_stride4;                                             \
  }                                                                     \
                                                                        \
  if (vflag == VIRIAL_FDOTR) {                                          \
    int nt_min = MIN(4,nthreads);                                       \
    IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start,  \
                              f_stride, pos, ov0, ov1, ov2, ov3, ov4,   \
                              ov5);                                     \
  }                                                                     \
}

#endif

#ifdef _LMP_INTEL_OFFLOAD
#include <sys/time.h>

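// Wall-clock timer usable on both host and coprocessor during offload
// (hence the target attribute below); resolution is that of gettimeofday().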
__declspec( target (mic))
inline double MIC_Wtime() {
  double time;
  struct timeval tv;

  gettimeofday(&tv, nullptr);
  time = 1.0 * tv.tv_sec + 1.0e-6 * tv.tv_usec;
  return time;
}

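// Round a neighbor-list length up to a multiple of the padding width,
// scaled by sizeof(float)/sizeof(flt_t) so the pad spans the same number
// of bytes in single and double precision; offloaded lists use the MIC pad.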
#define IP_PRE_neighbor_pad(jnum, offload)                              \
{                                                                       \
  const int opad_mask = ~static_cast<int>(INTEL_MIC_NBOR_PAD *          \
                                          sizeof(float) /               \
                                          sizeof(flt_t) - 1);           \
  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
                                          sizeof(float) /               \
                                          sizeof(flt_t) - 1);           \
  if (offload && INTEL_MIC_NBOR_PAD > 1)                                \
    jnum = (jnum + INTEL_MIC_NBOR_PAD * sizeof(float) /                 \
            sizeof(flt_t) - 1) & opad_mask;                             \
  else if (INTEL_NBOR_PAD > 1)                                          \
    jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) /                     \
            sizeof(flt_t) - 1) & pad_mask;                              \
}

#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                     nlocal, nall)                      \
{                                                                       \
  if (fix->separate_buffers() && ago != 0) {                            \
    fix->start_watch(TIME_PACK);                                        \
    if (offload) {                                                      \
      int packthreads;                                                  \
      if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
      else packthreads = 1;                                             \
      _use_omp_pragma("omp parallel if(packthreads > 1)")               \
      {                                                                 \
        int ifrom, ito, tid;                                            \
        IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal,              \
                                  packthreads, sizeof(flt_t));          \
        buffers->thr_pack_cop(ifrom, ito, 0);                           \
        int nghost = nall - nlocal;                                     \
        if (nghost) {                                                   \
          IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal,        \
                                 packthreads, sizeof(flt_t));           \
          buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal,           \
                                fix->offload_min_ghost() - nlocal,      \
                                ago == 1);                              \
        }                                                               \
      }                                                                 \
    } else {                                                            \
      buffers->thr_pack_host(fix->host_min_local(), nlocal, 0);         \
      buffers->thr_pack_host(nlocal, nall,                              \
                             fix->host_min_ghost()-nlocal);             \
    }                                                                   \
    fix->stop_watch(TIME_PACK);                                         \
  }                                                                     \
}

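// Set per-step host-device transfer sizes: a neighbor-list rebuild
// (ago == 0) transfers charges and selects the buffer-separation mode,
// otherwise only positions are resent; ev_size counts energy/virial
// accumulators and f_stride covers ghost forces when newton requires them.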
#define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
                             buffers, offload, fix, separate_flag,      \
                             x_size, q_size, ev_size, f_stride)         \
{                                                                       \
  separate_flag = 0;                                                    \
  if (ago == 0) {                                                       \
    x_size = 0;                                                         \
    q_size = nall;                                                      \
    if (offload) {                                                      \
      if (fix->separate_buffers()) {                                    \
        if (lmp->atom->torque)                                          \
          separate_flag = 2;                                            \
        else                                                            \
          separate_flag = 1;                                            \
      } else                                                            \
        separate_flag = 3;                                              \
    }                                                                   \
  } else {                                                              \
    x_size = nall;                                                      \
    q_size = 0;                                                         \
  }                                                                     \
  ev_size = 0;                                                          \
  if (eflag) ev_size = 2;                                               \
  if (vflag) ev_size = 8;                                               \
  if (newton)                                                           \
    f_stride = buffers->get_stride(nall);                               \
  else                                                                  \
    f_stride = buffers->get_stride(nlocal);                             \
}

#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
                           ev_global)                                   \
{                                                                       \
  if (offload) {                                                        \
    tc = buffers->get_off_threads();                                    \
    f_start = buffers->get_off_f();                                     \
    ev_global = buffers->get_ev_global();                               \
  } else {                                                              \
    tc = comm->nthreads;                                                \
    f_start = buffers->get_f();                                         \
    fix->start_watch(TIME_HOST_PAIR);                                   \
    ev_global = buffers->get_ev_global_host();                          \
  }                                                                     \
}

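// After offload with separated buffers, rebase nlocal/nall to the bounds
// recorded in the overflow array and slide the ghost data down so it is
// contiguous with the local data; a sentinel position of INTEL_BIGP past
// the end keeps vectorized loops from acting on stale entries.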
#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
                                  f_stride, x, q)                       \
{                                                                       \
  if (separate_flag) {                                                  \
    if (separate_flag < 3) {                                            \
      int all_local = nlocal;                                           \
      int ghost_min = overflow[LMP_GHOST_MIN];                          \
      nlocal = overflow[LMP_LOCAL_MAX] + 1;                             \
      int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min;             \
      if (nghost < 0) nghost = 0;                                       \
      nall = nlocal + nghost;                                           \
      separate_flag--;                                                  \
      int flength;                                                      \
      if (newton) flength = nall;                                       \
      else flength = nlocal;                                            \
      IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T),             \
                           separate_flag);                              \
      if (nghost) {                                                     \
        if (nlocal < all_local || ghost_min > all_local) {              \
          memmove(x + nlocal, x + ghost_min,                            \
                  (nall - nlocal) * sizeof(ATOM_T));                    \
          if (q != 0)                                                   \
            memmove((void *)(q + nlocal), (void *)(q + ghost_min),      \
                    (nall - nlocal) * sizeof(flt_t));                   \
        }                                                               \
      }                                                                 \
    }                                                                   \
    x[nall].x = INTEL_BIGP;                                             \
    x[nall].y = INTEL_BIGP;                                             \
    x[nall].z = INTEL_BIGP;                                             \
  }                                                                     \
}

#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
                                f_start, f_stride, x, offload, vflag,   \
                                ov0, ov1, ov2, ov3, ov4, ov5)           \
{                                                                       \
  if (newton) {                                                         \
    _use_omp_pragma("omp barrier");                                     \
    IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start,           \
                           f_stride, x, offload, vflag, ov0, ov1, ov2,  \
                           ov3, ov4, ov5);                              \
  }                                                                     \
}

#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
                            ov0, ov1, ov2, ov3, ov4, ov5)

#else

#if INTEL_NBOR_PAD > 1

#define IP_PRE_neighbor_pad(jnum, offload)                              \
{                                                                       \
  const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD *               \
                                         sizeof(float) /                \
                                         sizeof(flt_t) - 1);            \
  jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) /                       \
          sizeof(flt_t) - 1) & pad_mask;                                \
}

#else

#define IP_PRE_neighbor_pad(jnum, offload)

#endif

#define MIC_Wtime MPI_Wtime
#define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload,        \
                                     nlocal, nall)

#define IP_PRE_get_transfern(ago, newton, eflag, vflag,                 \
                             buffers, offload, fix, separate_flag,      \
                             x_size, q_size, ev_size, f_stride)         \
{                                                                       \
  separate_flag = 0;                                                    \
  int f_length;                                                         \
  if (newton)                                                           \
    f_length = nall;                                                    \
  else                                                                  \
    f_length = nlocal;                                                  \
  f_stride = buffers->get_stride(f_length);                             \
}

#define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start,          \
                           ev_global)                                   \
{                                                                       \
  tc = comm->nthreads;                                                  \
  f_start = buffers->get_f();                                           \
  fix->start_watch(TIME_HOST_PAIR);                                     \
  ev_global = buffers->get_ev_global_host();                            \
}

#define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall,  \
                                  f_stride, x, q)

#define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads,       \
                                f_start, f_stride, x, offload, vflag,   \
                                ov0, ov1, ov2, ov3, ov4, ov5)           \
{                                                                       \
  if (newton) {                                                         \
933     if (vflag == 2 && nthreads > INTEL_HTHREADS) {                      \
934       _use_omp_pragma("omp barrier");                                   \
935       buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2,    \
936                             ov3, ov4, ov5);                             \
937     }                                                                   \
938   }                                                                     \
939 }
940 
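// Serial F dot r reduction used when the thread count is small
// (<= INTEL_HTHREADS).  lt = nall * 4 because each atom carries four
// stored components (x, y, z, w).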
#define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag,    \
                            ov0, ov1, ov2, ov3, ov4, ov5)               \
{                                                                       \
  if (newton) {                                                         \
    if (vflag == 2 && nthreads <= INTEL_HTHREADS) {                     \
      int lt = nall * 4;                                                \
      buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1,     \
                               ov2, ov3, ov4, ov5);                     \
    }                                                                   \
  }                                                                     \
}

#endif

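// Per-pair virial tally: accumulates the six virial-tensor components
// (xx, yy, zz, xy, xz, yz) into the thread-local sv0..sv5 for a scalar
// pair force fpair acting along (delx, dely, delz).  Usage sketch
// (illustrative names, inside a pair inner loop):
//   const flt_t fpair = forcelj * r2inv;
//   IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz);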
#define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz)            \
{                                                                       \
  if (vflag == 1) {                                                     \
    sv0 += delx * delx * fpair;                                         \
    sv1 += dely * dely * fpair;                                         \
    sv2 += delz * delz * fpair;                                         \
    sv3 += delx * dely * fpair;                                         \
    sv4 += delx * delz * fpair;                                         \
    sv5 += dely * delz * fpair;                                         \
  }                                                                     \
}

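// Same tally with explicit force components (fpx, fpy, fpz) instead of
// a scalar prefactor, for forces not parallel to the separation vector.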
#define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz)         \
{                                                                       \
  if (vflag == 1) {                                                     \
    sv0 += dx * fpx;                                                    \
    sv1 += dy * fpy;                                                    \
    sv2 += dz * fpz;                                                    \
    sv3 += dx * fpy;                                                    \
    sv4 += dx * fpz;                                                    \
    sv5 += dy * fpz;                                                    \
  }                                                                     \
}

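// Three-body virial tally: fj and fk are the force vectors on the two
// neighbors, paired with (delx, dely, delz) and the 3-vector delr2 as
// the corresponding separation vectors of the triplet.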
#define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2)   \
{                                                                       \
  if (vflag == 1) {                                                     \
    sv0 += delx * fj[0] + delr2[0] * fk[0];                             \
    sv1 += dely * fj[1] + delr2[1] * fk[1];                             \
    sv2 += delz * fj[2] + delr2[2] * fk[2];                             \
    sv3 += delx * fj[1] + delr2[0] * fk[1];                             \
    sv4 += delx * fj[2] + delr2[0] * fk[2];                             \
    sv5 += dely * fj[2] + delr2[1] * fk[2];                             \
  }                                                                     \
}

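// Reduced three-body variant that tallies only the j-side force, given
// as separate components (fj0, fj1, fj2).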
#define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz)  \
{                                                                       \
  if (vflag == 1) {                                                     \
    sv0 += delx * fj0;                                                  \
    sv1 += dely * fj1;                                                  \
    sv2 += delz * fj2;                                                  \
    sv3 += delx * fj1;                                                  \
    sv4 += delx * fj2;                                                  \
    sv5 += dely * fj2;                                                  \
  }                                                                     \
}

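// Energy/virial tally for bonds.  ev_pre weights the contribution:
// 1.0 with newton on, otherwise 0.5 per locally owned atom, so ghost
// partners are not double counted across ranks.  Per-atom energy goes
// into the w component of the force array (note: the body uses an
// array named f from the enclosing scope, not the force argument).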
#define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
                             fbond, delx, dely, delz, obond, force,     \
                             newton, nlocal, ov0, ov1, ov2, ov3, ov4,   \
                             ov5)                                       \
{                                                                       \
  flt_t ev_pre;                                                         \
  if (newton) ev_pre = (flt_t)1.0;                                      \
  else {                                                                \
    ev_pre = (flt_t)0.0;                                                \
    if (i1 < nlocal) ev_pre += (flt_t)0.5;                              \
    if (i2 < nlocal) ev_pre += (flt_t)0.5;                              \
  }                                                                     \
                                                                        \
  if (eflag) {                                                          \
    obond += ev_pre * ebond;                                            \
    if (eatom) {                                                        \
      flt_t halfeng = ebond * (flt_t)0.5;                               \
      if (newton || i1 < nlocal) f[i1].w += halfeng;                    \
      if (newton || i2 < nlocal) f[i2].w += halfeng;                    \
    }                                                                   \
  }                                                                     \
                                                                        \
  if (VFLAG && vflag) {                                                 \
    ov0 += ev_pre * (delx * delx * fbond);                              \
    ov1 += ev_pre * (dely * dely * fbond);                              \
    ov2 += ev_pre * (delz * delz * fbond);                              \
    ov3 += ev_pre * (delx * dely * fbond);                              \
    ov4 += ev_pre * (delx * delz * fbond);                              \
    ov5 += ev_pre * (dely * delz * fbond);                              \
  }                                                                     \
}

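// Same scheme for angles: each of the three atoms owns a third of the
// energy, and the virial comes from the two bond vectors and the
// forces on atoms 1 and 3.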
#define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1,   \
                              i2, i3, f1x, f1y, f1z, f3x, f3y, f3z,     \
                              delx1, dely1, delz1, delx2, dely2, delz2, \
                              oeangle, force, newton, nlocal, ov0, ov1, \
                              ov2, ov3, ov4, ov5)                       \
{                                                                       \
  flt_t ev_pre;                                                         \
  if (newton) ev_pre = (flt_t)1.0;                                      \
  else {                                                                \
    ev_pre = (flt_t)0.0;                                                \
    if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
    if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
    if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333;               \
  }                                                                     \
                                                                        \
  if (eflag) {                                                          \
    oeangle += ev_pre * eangle;                                         \
    if (eatom) {                                                        \
      flt_t thirdeng = eangle * (flt_t)0.3333333333333333;              \
      if (newton || i1 < nlocal) f[i1].w += thirdeng;                   \
      if (newton || i2 < nlocal) f[i2].w += thirdeng;                   \
      if (newton || i3 < nlocal) f[i3].w += thirdeng;                   \
    }                                                                   \
  }                                                                     \
                                                                        \
  if (VFLAG && vflag) {                                                 \
    ov0 += ev_pre * (delx1 * f1x + delx2 * f3x);                        \
    ov1 += ev_pre * (dely1 * f1y + dely2 * f3y);                        \
    ov2 += ev_pre * (delz1 * f1z + delz2 * f3z);                        \
    ov3 += ev_pre * (delx1 * f1y + delx2 * f3y);                        \
    ov4 += ev_pre * (delx1 * f1z + delx2 * f3z);                        \
    ov5 += ev_pre * (dely1 * f1z + dely2 * f3z);                        \
  }                                                                     \
}

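// And for dihedrals: quarter weights over the four atoms; the virial
// uses the three bond vectors vb1..vb3 and the forces on atoms 1, 3,
// and 4.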
#define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
                              i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
                              f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y,   \
                              vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
                              newton, nlocal, ov0, ov1, ov2, ov3, ov4,  \
                              ov5)                                      \
{                                                                       \
  flt_t ev_pre;                                                         \
  if (newton) ev_pre = (flt_t)1.0;                                      \
  else {                                                                \
    ev_pre = (flt_t)0.0;                                                \
    if (i1 < nlocal) ev_pre += (flt_t)0.25;                             \
    if (i2 < nlocal) ev_pre += (flt_t)0.25;                             \
    if (i3 < nlocal) ev_pre += (flt_t)0.25;                             \
    if (i4 < nlocal) ev_pre += (flt_t)0.25;                             \
  }                                                                     \
                                                                        \
  if (eflag) {                                                          \
    oedihedral += ev_pre * deng;                                        \
    if (eatom) {                                                        \
      flt_t qdeng = deng * (flt_t)0.25;                                 \
      if (newton || i1 < nlocal) f[i1].w += qdeng;                      \
      if (newton || i2 < nlocal) f[i2].w += qdeng;                      \
      if (newton || i3 < nlocal) f[i3].w += qdeng;                      \
      if (newton || i4 < nlocal) f[i4].w += qdeng;                      \
    }                                                                   \
  }                                                                     \
                                                                        \
  if (VFLAG && vflag) {                                                 \
    ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x);            \
    ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y);            \
    ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z);            \
    ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y);            \
    ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z);            \
    ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z);            \
  }                                                                     \
}

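// Flushes the thread-local per-atom accumulators at the end of an
// atom's neighbor loop: fwtmp -> f[i].w, sevdwl -> oevdwl, and, when
// newton is off and the virial is tallied per pair (vflag == 1),
// sv0..sv5 -> ov0..ov5.  All of these names come from the enclosing
// scope.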
#define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp)            \
{                                                                       \
  if (eflag) {                                                          \
    f[i].w += fwtmp;                                                    \
    oevdwl += sevdwl;                                                   \
  }                                                                     \
  if (newton == 0 && vflag == 1) {                                      \
    ov0 += sv0;                                                         \
    ov1 += sv1;                                                         \
    ov2 += sv2;                                                         \
    ov3 += sv3;                                                         \
    ov4 += sv4;                                                         \
    ov5 += sv5;                                                         \
  }                                                                     \
}

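// Charged variant: additionally flushes the Coulomb energy
// (secoul -> oecoul).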
#define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp)           \
{                                                                       \
  if (eflag) {                                                          \
    f[i].w += fwtmp;                                                    \
    oevdwl += sevdwl;                                                   \
    oecoul += secoul;                                                   \
  }                                                                     \
  if (newton == 0 && vflag == 1) {                                      \
    ov0 += sv0;                                                         \
    ov1 += sv1;                                                         \
    ov2 += sv2;                                                         \
    ov3 += sv3;                                                         \
    ov4 += sv4;                                                         \
    ov5 += sv5;                                                         \
  }                                                                     \
}

}   // namespace LAMMPS_NS

#endif   // LMP_INTEL_PREPROCESS_H