1 // clang-format off
2 /* -*- c++ -*- -------------------------------------------------------------
3 LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
4 https://www.lammps.org/, Sandia National Laboratories
5 Steve Plimpton, sjplimp@sandia.gov
6
7 Copyright (2003) Sandia Corporation. Under the terms of Contract
8 DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
9 certain rights in this software. This software is distributed under
10 the GNU General Public License.
11
12 See the README file in the top-level LAMMPS directory.
13 ------------------------------------------------------------------------- */
14
15 /* ----------------------------------------------------------------------
16 Contributing author: W. Michael Brown (Intel)
17 ------------------------------------------------------------------------- */
18
19 #ifdef __INTEL_LLVM_COMPILER
20 #define USE_OMP_SIMD
21 #define __INTEL_COMPILER __INTEL_LLVM_COMPILER
22 #define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER
23 #endif
24
25 #ifdef __INTEL_COMPILER
26 #define LMP_SIMD_COMPILER
27 #if (__INTEL_COMPILER_BUILD_DATE > 20160720)
28 #define LMP_INTEL_USE_SIMDOFF
29 #endif
30 #pragma warning (disable:3948)
31 #pragma warning (disable:3949)
32 #pragma warning (disable:13200)
33 #endif
34
35 #ifdef __INTEL_OFFLOAD
36 #ifdef LMP_INTEL_OFFLOAD
37 #define _LMP_INTEL_OFFLOAD
38 #ifdef __TARGET_ARCH_MIC
39 #ifndef __MIC__
40 #define __MIC__ 1
41 #endif
42 #endif
43 #endif
44 #endif
45
46 #ifndef LMP_INTEL_PREPROCESS_H
47 #define LMP_INTEL_PREPROCESS_H
48
49 // LAMMPS_MEMALIGN is set to 64 by default for -DLMP_INTEL
50 // so we only need to error out in case of a different alignment
51 #if LAMMPS_MEMALIGN && (LAMMPS_MEMALIGN != 64)
52 #error Please set -DLAMMPS_MEMALIGN=64 in CCFLAGS of your LAMMPS makefile for INTEL package
53 #endif
54
55 #if defined(_OPENMP)
56 #define _use_omp_pragma(txt) _Pragma(txt)
57 #else
58 #define _use_omp_pragma(txt)
59 #endif
60
61 #if defined(LMP_SIMD_COMPILER)
62 #define _use_simd_pragma(txt) _Pragma(txt)
63 #else
64 #define _use_simd_pragma(txt)
65 #endif
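// Illustration of the two wrapper macros above, assuming an Intel compiler
// with OpenMP enabled: _use_omp_pragma("omp barrier") expands to
// _Pragma("omp barrier"), i.e. #pragma omp barrier, and
// _use_simd_pragma("vector aligned") expands to #pragma vector aligned.
// Without _OPENMP or LMP_SIMD_COMPILER the corresponding wrapper expands
// to nothing, so the pragmas can be written unconditionally below.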
66
67 namespace LAMMPS_NS {
68
69 enum {LMP_OVERFLOW, LMP_LOCAL_MIN, LMP_LOCAL_MAX, LMP_GHOST_MIN,
70 LMP_GHOST_MAX};
71 enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
72 TIME_OFFLOAD_PAIR, TIME_OFFLOAD_WAIT, TIME_OFFLOAD_LATENCY,
73 TIME_IMBALANCE};
74
75 #define NUM_ITIMERS ( TIME_IMBALANCE + 1 )
76 #define INTEL_MIC_VECTOR_WIDTH 16
77 #define INTEL_VECTOR_WIDTH 4
78 #define INTEL_MAX_STENCIL 256
79 // INTEL_MAX_STENCIL * sqrt(INTEL_MAX_STENCIL)
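// i.e. 256 * 16 = 4096 with INTEL_MAX_STENCIL = 256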
80 #define INTEL_MAX_STENCIL_CHECK 4096
81 #define INTEL_P3M_MAXORDER 8
82 #define INTEL_P3M_ALIGNED_MAXORDER 8
83 // PRECOMPUTE VALUES IN TABLE (DOESN'T AFFECT ACCURACY)
84 #define INTEL_P3M_TABLE 1
85
86 #ifdef __INTEL_COMPILER
87 #ifdef __AVX__
88 #undef INTEL_VECTOR_WIDTH
89 #define INTEL_VECTOR_WIDTH 8
90 #endif
91
92 #ifdef __AVX2__
93 #undef INTEL_VECTOR_WIDTH
94 #define INTEL_VECTOR_WIDTH 8
95 #endif
96
97 #ifdef __AVX512F__
98 #undef INTEL_VECTOR_WIDTH
99 #define INTEL_VECTOR_WIDTH 16
100 #define INTEL_V512 1
101 #define INTEL_VMASK 1
102 #else
103 #ifdef __MIC__
104 #define INTEL_V512 1
105 #define INTEL_VMASK 1
106 #define INTEL_HTHREADS 4
107 #endif
108 #endif
109
110 #ifdef __AVX512ER__
111 #define INTEL_HTHREADS 4
112 #endif
113
114 #ifdef __AVX512CD__
115 #ifndef _LMP_INTEL_OFFLOAD
116 #define LMP_USE_AVXCD
117 #endif
118 #endif
119
120 #ifdef __MIC__
121 #define INTEL_COMPILE_WIDTH INTEL_MIC_VECTOR_WIDTH
122 #else
123 #define INTEL_COMPILE_WIDTH INTEL_VECTOR_WIDTH
124 #endif
125
126 #else
127
128 #undef INTEL_VECTOR_WIDTH
129 #define INTEL_VECTOR_WIDTH 1
130 #define INTEL_COMPILE_WIDTH 1
131
132 #endif
133
134 #define INTEL_DATA_ALIGN 64
135 #define INTEL_ONEATOM_FACTOR 1
136 #define INTEL_MIC_NBOR_PAD INTEL_MIC_VECTOR_WIDTH
137 #define INTEL_NBOR_PAD INTEL_VECTOR_WIDTH
138 #define INTEL_LB_MEAN_WEIGHT 0.1
139 #define INTEL_BIGP 1e15
140 #define INTEL_MAX_HOST_CORE_COUNT 512
141 #define INTEL_MAX_COI_CORES 36
142
143 #ifndef INTEL_HTHREADS
144 #define INTEL_HTHREADS 2
145 #endif
146
147 #if INTEL_DATA_ALIGN > 1
148
149 #define IP_PRE_edge_align(n, esize) \
150 { \
151 const int pad_mask = ~static_cast<int>(INTEL_DATA_ALIGN/esize-1); \
152 n = (n + INTEL_DATA_ALIGN / esize - 1) & pad_mask; \
153 }
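// Example: with INTEL_DATA_ALIGN = 64 and esize = sizeof(float) = 4,
// IP_PRE_edge_align rounds n up to the next multiple of 16 elements
// (e.g. n = 37 becomes 48) so a row of n elements ends on a 64-byte
// boundary; with esize = 8 the granularity is 8 elements.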
154
155 #else
156
157 #define IP_PRE_edge_align(n, esize) \
158
159 #endif
160
161 #define IP_PRE_get_stride(stride, n, datasize, torque) \
162 { \
163 int blength = n; \
164 if (torque) blength *= 2; \
165 const int bytes = blength * datasize; \
166 stride = INTEL_DATA_ALIGN - (bytes % INTEL_DATA_ALIGN); \
167 stride = blength + stride / datasize; \
168 }
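// Example: for n = 1000 force entries with datasize = 32 bytes and torque
// disabled, bytes = 32000 is already a multiple of 64; the macro still adds
// 64/32 = 2 padding entries and returns stride = 1002, so consecutive
// per-thread force blocks of `stride` entries keep 64-byte alignment.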
169
170 #if defined(_OPENMP)
171
172 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
173 { \
174 int idelta = inum/nthreads; \
175 const int imod = inum % nthreads; \
176 ifrom = tid * idelta; \
177 ito = ifrom + idelta; \
178 if (tid < imod) { \
179 ito+=tid+1; \
180 ifrom+=tid; \
181 } else { \
182 ito+=imod; \
183 ifrom+=imod; \
184 } \
185 }
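// Example: IP_PRE_omp_range with inum = 10 and nthreads = 4 assigns the
// contiguous, exhaustive ranges [0,3), [3,6), [6,8) and [8,10) to
// tid = 0..3; the first (inum % nthreads) threads receive one extra
// iteration.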
186
187 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
188 { \
189 tid = omp_get_thread_num(); \
190 IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads); \
191 }
192
193 #define IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr) \
194 { \
195 if (nthr <= INTEL_HTHREADS) { \
196 ifrom = tid; \
197 ito = inum; \
198 ip = nthr; \
199 } else if (nthr % INTEL_HTHREADS == 0) { \
200 int nd = nthr / INTEL_HTHREADS; \
201 int td = tid / INTEL_HTHREADS; \
202 int tm = tid % INTEL_HTHREADS; \
203 IP_PRE_omp_range(ifrom, ito, td, inum, nd); \
204 ifrom += tm; \
205 ip = INTEL_HTHREADS; \
206 } else { \
207 IP_PRE_omp_range(ifrom, ito, tid, inum, nthr); \
208 ip = 1; \
209 } \
210 }
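// The stride variant above keeps hyperthreads that (with compact affinity,
// e.g. KMP_AFFINITY=compact) share a physical core working on interleaved
// indices of the same block.  Example: with nthr = 8, INTEL_HTHREADS = 2
// and inum = 100, threads 0 and 1 both cover the block [0,25) starting at
// 0 and 1 with a step of ip = 2, threads 2 and 3 cover [25,50), and so on.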
211
212 #define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
213 { \
214 tid = omp_get_thread_num(); \
215 IP_PRE_omp_stride(ifrom, ip, ito, tid, inum, nthr); \
216 }
217
218 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
219 datasize) \
220 { \
221 int chunk_size = INTEL_DATA_ALIGN / datasize; \
222 int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
223 /chunk_size/nthreads)); \
224 idelta *= chunk_size; \
225 ifrom = tid*idelta; \
226 ito = ifrom + idelta; \
227 if (ito > inum) ito = inum; \
228 }
229
230 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \
231 nthreads, datasize) \
232 { \
233 tid = omp_get_thread_num(); \
234 IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
235 datasize); \
236 }
237
238 #define IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads, \
239 vecsize) \
240 { \
241 int idelta = static_cast<int>(ceil(static_cast<float>(inum) \
242 /vecsize/nthreads)); \
243 idelta *= vecsize; \
244 ifrom = tid*idelta; \
245 ito = ifrom + idelta; \
246 if (ito > inum) ito = inum; \
247 }
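// Example: IP_PRE_omp_range_vec with inum = 100, vecsize = 16 and
// nthreads = 4 gives idelta = 32, so the ranges are [0,32), [32,64),
// [64,96) and [96,100); each range starts on a multiple of the vector
// width and only the last may end off it.  IP_PRE_omp_range_align works
// the same way with a granularity of INTEL_DATA_ALIGN / datasize elements.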
248
249 #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \
250 nthreads, vecsize) \
251 { \
252 tid = omp_get_thread_num(); \
253 IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthreads, \
254 vecsize); \
255 }
256
257 #define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
258 nthr, vecsize) \
259 { \
260 tid = omp_get_thread_num(); \
261 if (nthr <= INTEL_HTHREADS) { \
262 ifrom = tid*vecsize; \
263 ito = inum; \
264 ip = nthr*vecsize; \
265 } else if (nthr % INTEL_HTHREADS == 0) { \
266 int nd = nthr / INTEL_HTHREADS; \
267 int td = tid / INTEL_HTHREADS; \
268 int tm = tid % INTEL_HTHREADS; \
269 IP_PRE_omp_range_vec(ifrom, ito, td, inum, nd, vecsize); \
270 ifrom += tm * vecsize; \
271 ip = INTEL_HTHREADS * vecsize; \
272 } else { \
273 IP_PRE_omp_range_vec(ifrom, ito, tid, inum, nthr, \
274 vecsize); \
275 ip = vecsize; \
276 } \
277 }
278
279 #else
280
281 #define IP_PRE_omp_range_id(ifrom, ito, tid, inum, nthreads) \
282 { \
283 tid = 0; \
284 ifrom = 0; \
285 ito = inum; \
286 }
287
288 #define IP_PRE_omp_range(ifrom, ito, tid, inum, nthreads) \
289 { \
290 ifrom = 0; \
291 ito = inum; \
292 }
293
294 #define IP_PRE_omp_stride_id(ifrom, ip, ito, tid, inum, nthr) \
295 { \
296 tid = 0; \
297 ifrom = 0; \
298 ito = inum; \
299 ip = 1; \
300 }
301
302 #define IP_PRE_omp_range_align(ifrom, ito, tid, inum, nthreads, \
303 datasize) \
304 { \
305 ifrom = 0; \
306 ito = inum; \
307 }
308
309 #define IP_PRE_omp_range_id_align(ifrom, ito, tid, inum, \
310 nthreads, datasize) \
311 { \
312 tid = 0; \
313 ifrom = 0; \
314 ito = inum; \
315 }
316
317 #define IP_PRE_omp_range_id_vec(ifrom, ito, tid, inum, \
318 nthreads, vecsize) \
319 { \
320 tid = 0; \
321 ifrom = 0; \
322 ito = inum; \
323 }
324
325 #define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum, \
326 nthr, vecsize) \
327 { \
328 tid = 0; \
329 ifrom = 0; \
330 ip = vecsize; \
331 ito = inum; \
332 }
333
334 #endif
335
336 // TO BE DEPRECATED
337 #ifndef USE_OMP_SIMD
338
339 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
340 f_stride, pos, ov0, ov1, ov2, \
341 ov3, ov4, ov5) \
342 { \
343 acc_t *f_scalar = &f_start[0].x; \
344 flt_t *x_scalar = &pos[minlocal].x; \
345 int f_stride4 = f_stride * 4; \
346 _alignvar(acc_t ovv[16],64); \
347 int vwidth; \
348 if (sizeof(acc_t) == sizeof(double)) \
349 vwidth = INTEL_COMPILE_WIDTH/2; \
350 else \
351 vwidth = INTEL_COMPILE_WIDTH; \
352 if (vwidth < 4) vwidth = 4; \
353 _use_simd_pragma("vector aligned") \
354 _use_simd_pragma("simd") \
355 for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
356 int remainder = lt % vwidth; \
357 if (lf > lt) remainder = 0; \
358 const int v_range = lt - remainder; \
359 if (nthreads == 2) { \
360 acc_t *f_scalar2 = f_scalar + f_stride4; \
361 for (int n = lf; n < v_range; n += vwidth) { \
362 _use_simd_pragma("vector aligned") \
363 _use_simd_pragma("simd") \
364 for (int v = 0; v < vwidth; v++) { \
365 f_scalar[n+v] += f_scalar2[n+v]; \
366 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
367 } \
368 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
369 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
370 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
371 if (vwidth > 4) { \
372 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
373 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
374 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
375 } \
376 if (vwidth > 8) { \
377 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
378 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
379 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
380 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
381 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
382 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
383 } \
384 } \
385 _use_simd_pragma("vector aligned") \
386 _use_simd_pragma("ivdep") \
387 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
388 for (int n = v_range; n < lt; n++) \
389 f_scalar[n] += f_scalar2[n]; \
390 } else if (nthreads==4) { \
391 acc_t *f_scalar2 = f_scalar + f_stride4; \
392 acc_t *f_scalar3 = f_scalar2 + f_stride4; \
393 acc_t *f_scalar4 = f_scalar3 + f_stride4; \
394 for (int n = lf; n < v_range; n += vwidth) { \
395 _use_simd_pragma("vector aligned") \
396 _use_simd_pragma("simd") \
397 for (int v = 0; v < vwidth; v++) { \
398 f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
399 f_scalar4[n+v]; \
400 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
401 } \
402 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
403 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
404 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
405 if (vwidth > 4) { \
406 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
407 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
408 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
409 } \
410 if (vwidth > 8) { \
411 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
412 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
413 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
414 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
415 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
416 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
417 } \
418 } \
419 _use_simd_pragma("vector aligned") \
420 _use_simd_pragma("ivdep") \
421 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
422 for (int n = v_range; n < lt; n++) \
423 f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
424 } else if (nthreads==1) { \
425 for (int n = lf; n < v_range; n += vwidth) { \
426 _use_simd_pragma("vector aligned") \
427 _use_simd_pragma("simd") \
428 for (int v = 0; v < vwidth; v++) \
429 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
430 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
431 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
432 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
433 if (vwidth > 4) { \
434 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
435 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
436 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
437 } \
438 if (vwidth > 8) { \
439 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
440 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
441 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
442 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
443 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
444 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
445 } \
446 } \
447 } else if (nthreads==3) { \
448 acc_t *f_scalar2 = f_scalar + f_stride4; \
449 acc_t *f_scalar3 = f_scalar2 + f_stride4; \
450 for (int n = lf; n < v_range; n += vwidth) { \
451 _use_simd_pragma("vector aligned") \
452 _use_simd_pragma("simd") \
453 for (int v = 0; v < vwidth; v++) { \
454 f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
455 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
456 } \
457 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
458 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
459 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
460 if (vwidth > 4) { \
461 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
462 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
463 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
464 } \
465 if (vwidth > 8) { \
466 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
467 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
468 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
469 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
470 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
471 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
472 } \
473 } \
474 _use_simd_pragma("vector aligned") \
475 _use_simd_pragma("ivdep") \
476 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
477 for (int n = v_range; n < lt; n++) \
478 f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
479 } \
480 for (int n = v_range; n < lt; n += 4) { \
481 _use_simd_pragma("vector aligned") \
482 _use_simd_pragma("ivdep") \
483 for (int v = 0; v < 4; v++) \
484 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
485 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
486 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
487 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
488 } \
489 ov0 += ovv[0]; \
490 ov1 += ovv[1]; \
491 ov2 += ovv[2]; \
492 if (vwidth > 4) { \
493 ov0 += ovv[4]; \
494 ov1 += ovv[5]; \
495 ov2 += ovv[6]; \
496 } \
497 if (vwidth > 8) { \
498 ov0 += ovv[8] + ovv[12]; \
499 ov1 += ovv[9] + ovv[13]; \
500 ov2 += ovv[10] + ovv[14]; \
501 } \
502 }
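// The macro above folds the per-thread force copies (up to 4, selected by
// nthreads) into the first copy and simultaneously accumulates the f dot r
// virial over the thread's share of the packed x,y,z,w force/position
// arrays: ov0..ov2 collect fx*x, fy*y and fz*z from the vector lanes in
// ovv, while ov3..ov5 collect the off-diagonal terms fy*x, fz*x and fz*y.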
503
504 #define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
505 f_stride, pos, offload, vflag, ov0, ov1, \
506 ov2, ov3, ov4, ov5) \
507 { \
508 int o_range = (nall - minlocal) * 4; \
509 IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
510 sizeof(acc_t)); \
511 \
512 acc_t *f_scalar = &f_start[0].x; \
513 int f_stride4 = f_stride * 4; \
514 int t; \
515 if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
516 acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
517 for ( ; t < nthreads; t++) { \
518 _use_simd_pragma("vector aligned") \
519 _use_simd_pragma("simd") \
520 for (int n = iifrom; n < iito; n++) \
521 f_scalar[n] += f_scalar2[n]; \
522 f_scalar2 += f_stride4; \
523 } \
524 \
525 if (vflag == VIRIAL_FDOTR) { \
526 int nt_min = MIN(4,nthreads); \
527 IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
528 f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
529 ov5); \
530 } \
531 }
532
533 #else
534
535 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start, \
536 f_stride, pos, ov0, ov1, ov2, \
537 ov3, ov4, ov5) \
538 { \
539 acc_t *f_scalar = &f_start[0].x; \
540 flt_t *x_scalar = &pos[minlocal].x; \
541 int f_stride4 = f_stride * 4; \
542 _alignvar(acc_t ovv[16],64); \
543 int vwidth; \
544 if (sizeof(acc_t) == sizeof(double)) \
545 vwidth = INTEL_COMPILE_WIDTH/2; \
546 else \
547 vwidth = INTEL_COMPILE_WIDTH; \
548 if (vwidth < 4) vwidth = 4; \
549 _use_simd_pragma("omp simd aligned(ovv:64)") \
550 for (int v = 0; v < vwidth; v++) ovv[v] = (acc_t)0.0; \
551 int remainder = lt % vwidth; \
552 if (lf > lt) remainder = 0; \
553 const int v_range = lt - remainder; \
554 if (nthreads == 2) { \
555 acc_t *f_scalar2 = f_scalar + f_stride4; \
556 for (int n = lf; n < v_range; n += vwidth) { \
557 _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,ovv,x_scalar:64)")\
558 for (int v = 0; v < vwidth; v++) { \
559 f_scalar[n+v] += f_scalar2[n+v]; \
560 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
561 } \
562 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
563 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
564 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
565 if (vwidth > 4) { \
566 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
567 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
568 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
569 } \
570 if (vwidth > 8) { \
571 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
572 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
573 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
574 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
575 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
576 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
577 } \
578 } \
579 _use_simd_pragma("vector aligned") \
580 _use_simd_pragma("ivdep") \
581 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
582 for (int n = v_range; n < lt; n++) \
583 f_scalar[n] += f_scalar2[n]; \
584 } else if (nthreads==4) { \
585 acc_t *f_scalar2 = f_scalar + f_stride4; \
586 acc_t *f_scalar3 = f_scalar2 + f_stride4; \
587 acc_t *f_scalar4 = f_scalar3 + f_stride4; \
588 for (int n = lf; n < v_range; n += vwidth) { \
589 _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,f_scalar4,ovv:64)") \
590 for (int v = 0; v < vwidth; v++) { \
591 f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v] + \
592 f_scalar4[n+v]; \
593 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
594 } \
595 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
596 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
597 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
598 if (vwidth > 4) { \
599 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
600 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
601 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
602 } \
603 if (vwidth > 8) { \
604 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
605 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
606 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
607 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
608 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
609 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
610 } \
611 } \
612 _use_simd_pragma("vector aligned") \
613 _use_simd_pragma("ivdep") \
614 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
615 for (int n = v_range; n < lt; n++) \
616 f_scalar[n] += f_scalar2[n] + f_scalar3[n] + f_scalar4[n]; \
617 } else if (nthreads==1) { \
618 for (int n = lf; n < v_range; n += vwidth) { \
619 _use_simd_pragma("omp simd aligned(ovv,f_scalar,x_scalar:64)") \
620 for (int v = 0; v < vwidth; v++) \
621 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
622 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
623 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
624 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
625 if (vwidth > 4) { \
626 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
627 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
628 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
629 } \
630 if (vwidth > 8) { \
631 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
632 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
633 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
634 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
635 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
636 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
637 } \
638 } \
639 } else if (nthreads==3) { \
640 acc_t *f_scalar2 = f_scalar + f_stride4; \
641 acc_t *f_scalar3 = f_scalar2 + f_stride4; \
642 for (int n = lf; n < v_range; n += vwidth) { \
643 _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2,f_scalar3,ovv,x_scalar:64)") \
644 for (int v = 0; v < vwidth; v++) { \
645 f_scalar[n+v] += f_scalar2[n+v] + f_scalar3[n+v]; \
646 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
647 } \
648 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
649 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
650 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
651 if (vwidth > 4) { \
652 ov3 += f_scalar[n+5] * x_scalar[n+4]; \
653 ov4 += f_scalar[n+6] * x_scalar[n+4]; \
654 ov5 += f_scalar[n+6] * x_scalar[n+5]; \
655 } \
656 if (vwidth > 8) { \
657 ov3 += f_scalar[n+9] * x_scalar[n+8]; \
658 ov3 += f_scalar[n+13] * x_scalar[n+12]; \
659 ov4 += f_scalar[n+10] * x_scalar[n+8]; \
660 ov4 += f_scalar[n+14] * x_scalar[n+12]; \
661 ov5 += f_scalar[n+10] * x_scalar[n+9]; \
662 ov5 += f_scalar[n+14] * x_scalar[n+13]; \
663 } \
664 } \
665 _use_simd_pragma("vector aligned") \
666 _use_simd_pragma("ivdep") \
667 _use_simd_pragma("loop_count min(4) max(INTEL_COMPILE_WIDTH)") \
668 for (int n = v_range; n < lt; n++) \
669 f_scalar[n] += f_scalar2[n] + f_scalar3[n]; \
670 } \
671 for (int n = v_range; n < lt; n += 4) { \
672 _use_simd_pragma("vector aligned") \
673 _use_simd_pragma("ivdep") \
674 for (int v = 0; v < 4; v++) \
675 ovv[v] += f_scalar[n+v] * x_scalar[n+v]; \
676 ov3 += f_scalar[n+1] * x_scalar[n+0]; \
677 ov4 += f_scalar[n+2] * x_scalar[n+0]; \
678 ov5 += f_scalar[n+2] * x_scalar[n+1]; \
679 } \
680 ov0 += ovv[0]; \
681 ov1 += ovv[1]; \
682 ov2 += ovv[2]; \
683 if (vwidth > 4) { \
684 ov0 += ovv[4]; \
685 ov1 += ovv[5]; \
686 ov2 += ovv[6]; \
687 } \
688 if (vwidth > 8) { \
689 ov0 += ovv[8] + ovv[12]; \
690 ov1 += ovv[9] + ovv[13]; \
691 ov2 += ovv[10] + ovv[14]; \
692 } \
693 }
694
695 #define IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
696 f_stride, pos, offload, vflag, ov0, ov1, \
697 ov2, ov3, ov4, ov5) \
698 { \
699 int o_range = (nall - minlocal) * 4; \
700 IP_PRE_omp_range_id_align(iifrom, iito, tid, o_range, nthreads, \
701 sizeof(acc_t)); \
702 \
703 acc_t *f_scalar = &f_start[0].x; \
704 int f_stride4 = f_stride * 4; \
705 int t; \
706 if (vflag == VIRIAL_FDOTR) t = 4; else t = 1; \
707 acc_t *f_scalar2 = f_scalar + f_stride4 * t; \
708 for ( ; t < nthreads; t++) { \
709 _use_simd_pragma("omp simd aligned(f_scalar,f_scalar2:64)") \
710 for (int n = iifrom; n < iito; n++) \
711 f_scalar[n] += f_scalar2[n]; \
712 f_scalar2 += f_stride4; \
713 } \
714 \
715 if (vflag == VIRIAL_FDOTR) { \
716 int nt_min = MIN(4,nthreads); \
717 IP_PRE_fdotr_acc_force_l5(iifrom, iito, minlocal, nt_min, f_start, \
718 f_stride, pos, ov0, ov1, ov2, ov3, ov4, \
719 ov5); \
720 } \
721 }
722
723 #endif
724
725 #ifdef _LMP_INTEL_OFFLOAD
726 #include <sys/time.h>
727
728 __declspec( target (mic))
729 inline double MIC_Wtime() {
730 double time;
731 struct timeval tv;
732
733 gettimeofday(&tv, nullptr);
734 time = 1.0 * tv.tv_sec + 1.0e-6 * tv.tv_usec;
735 return time;
736 }
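// MIC_Wtime plays the role of MPI_Wtime for sections that may run on the
// coprocessor during an offload: it returns wall-clock seconds with
// microsecond resolution from gettimeofday() and is compiled for both the
// host and the mic target via the __declspec above.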
737
738 #define IP_PRE_neighbor_pad(jnum, offload) \
739 { \
740 const int opad_mask = ~static_cast<int>(INTEL_MIC_NBOR_PAD * \
741 sizeof(float) / \
742 sizeof(flt_t) - 1); \
743 const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD * \
744 sizeof(float) / \
745 sizeof(flt_t) - 1); \
746 if (offload && INTEL_MIC_NBOR_PAD > 1) \
747 jnum = (jnum + INTEL_MIC_NBOR_PAD * sizeof(float) / \
748 sizeof(flt_t) - 1) & opad_mask; \
749 else if (INTEL_NBOR_PAD > 1) \
750 jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) / \
751 sizeof(flt_t) - 1) & pad_mask; \
752 }
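// Example: assuming flt_t = float and INTEL_NBOR_PAD = 16 (AVX-512 host),
// a neighbor count of jnum = 37 is padded to 48; with flt_t = double the
// effective pad is halved (16 * 4 / 8 = 8) and 37 is padded to 40.  The
// offload branch applies the same rounding with INTEL_MIC_NBOR_PAD.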
753
754 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \
755 nlocal, nall) \
756 { \
757 if (fix->separate_buffers() && ago != 0) { \
758 fix->start_watch(TIME_PACK); \
759 if (offload) { \
760 int packthreads; \
761 if (comm->nthreads > INTEL_HTHREADS) packthreads = comm->nthreads;\
762 else packthreads = 1; \
763 _use_omp_pragma("omp parallel if(packthreads > 1)") \
764 { \
765 int ifrom, ito, tid; \
766 IP_PRE_omp_range_id_align(ifrom, ito, tid, nlocal, \
767 packthreads, sizeof(flt_t)); \
768 buffers->thr_pack_cop(ifrom, ito, 0); \
769 int nghost = nall - nlocal; \
770 if (nghost) { \
771 IP_PRE_omp_range_align(ifrom, ito, tid, nall - nlocal, \
772 packthreads, sizeof(flt_t)); \
773 buffers->thr_pack_cop(ifrom + nlocal, ito + nlocal, \
774 fix->offload_min_ghost() - nlocal, \
775 ago == 1); \
776 } \
777 } \
778 } else { \
779 buffers->thr_pack_host(fix->host_min_local(), nlocal, 0); \
780 buffers->thr_pack_host(nlocal, nall, \
781 fix->host_min_ghost()-nlocal); \
782 } \
783 fix->stop_watch(TIME_PACK); \
784 } \
785 }
786
787 #define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
788 buffers, offload, fix, separate_flag, \
789 x_size, q_size, ev_size, f_stride) \
790 { \
791 separate_flag = 0; \
792 if (ago == 0) { \
793 x_size = 0; \
794 q_size = nall; \
795 if (offload) { \
796 if (fix->separate_buffers()) { \
797 if (lmp->atom->torque) \
798 separate_flag = 2; \
799 else \
800 separate_flag = 1; \
801 } else \
802 separate_flag = 3; \
803 } \
804 } else { \
805 x_size = nall; \
806 q_size = 0; \
807 } \
808 ev_size = 0; \
809 if (eflag) ev_size = 2; \
810 if (vflag) ev_size = 8; \
811 if (newton) \
812 f_stride = buffers->get_stride(nall); \
813 else \
814 f_stride = buffers->get_stride(nlocal); \
815 }
816
817 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \
818 ev_global) \
819 { \
820 if (offload) { \
821 tc = buffers->get_off_threads(); \
822 f_start = buffers->get_off_f(); \
823 ev_global = buffers->get_ev_global(); \
824 } else { \
825 tc = comm->nthreads; \
826 f_start = buffers->get_f(); \
827 fix->start_watch(TIME_HOST_PAIR); \
828 ev_global = buffers->get_ev_global_host(); \
829 } \
830 }
831
832 #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \
833 f_stride, x, q) \
834 { \
835 if (separate_flag) { \
836 if (separate_flag < 3) { \
837 int all_local = nlocal; \
838 int ghost_min = overflow[LMP_GHOST_MIN]; \
839 nlocal = overflow[LMP_LOCAL_MAX] + 1; \
840 int nghost = overflow[LMP_GHOST_MAX] + 1 - ghost_min; \
841 if (nghost < 0) nghost = 0; \
842 nall = nlocal + nghost; \
843 separate_flag--; \
844 int flength; \
845 if (newton) flength = nall; \
846 else flength = nlocal; \
847 IP_PRE_get_stride(f_stride, flength, sizeof(FORCE_T), \
848 separate_flag); \
849 if (nghost) { \
850 if (nlocal < all_local || ghost_min > all_local) { \
851 memmove(x + nlocal, x + ghost_min, \
852 (nall - nlocal) * sizeof(ATOM_T)); \
853 if (q != 0) \
854 memmove((void *)(q + nlocal), (void *)(q + ghost_min), \
855 (nall - nlocal) * sizeof(flt_t)); \
856 } \
857 } \
858 } \
859 x[nall].x = INTEL_BIGP; \
860 x[nall].y = INTEL_BIGP; \
861 x[nall].z = INTEL_BIGP; \
862 } \
863 }
864
865 #define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
866 f_start, f_stride, x, offload, vflag, \
867 ov0, ov1, ov2, ov3, ov4, ov5) \
868 { \
869 if (newton) { \
870 _use_omp_pragma("omp barrier"); \
871 IP_PRE_fdotr_acc_force(nall, minlocal, nthreads, f_start, \
872 f_stride, x, offload, vflag, ov0, ov1, ov2, \
873 ov3, ov4, ov5); \
874 } \
875 }
876
877 #define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
878 ov0, ov1, ov2, ov3, ov4, ov5)
879
880 #else
881
882 #if INTEL_NBOR_PAD > 1
883
884 #define IP_PRE_neighbor_pad(jnum, offload) \
885 { \
886 const int pad_mask = ~static_cast<int>(INTEL_NBOR_PAD * \
887 sizeof(float) / \
888 sizeof(flt_t) - 1); \
889 jnum = (jnum + INTEL_NBOR_PAD * sizeof(float) / \
890 sizeof(flt_t) - 1) & pad_mask; \
891 }
892
893 #else
894
895 #define IP_PRE_neighbor_pad(jnum, offload)
896
897 #endif
898
899 #define MIC_Wtime MPI_Wtime
900 #define IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, \
901 nlocal, nall)
902
903 #define IP_PRE_get_transfern(ago, newton, eflag, vflag, \
904 buffers, offload, fix, separate_flag, \
905 x_size, q_size, ev_size, f_stride) \
906 { \
907 separate_flag = 0; \
908 int f_length; \
909 if (newton) \
910 f_length = nall; \
911 else \
912 f_length = nlocal; \
913 f_stride = buffers->get_stride(f_length); \
914 }
915
916 #define IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, \
917 ev_global) \
918 { \
919 tc = comm->nthreads; \
920 f_start = buffers->get_f(); \
921 fix->start_watch(TIME_HOST_PAIR); \
922 ev_global = buffers->get_ev_global_host(); \
923 }
924
925 #define IP_PRE_repack_for_offload(newton, separate_flag, nlocal, nall, \
926 f_stride, x, q)
927
928 #define IP_PRE_fdotr_reduce_omp(newton, nall, minlocal, nthreads, \
929 f_start, f_stride, x, offload, vflag, \
930 ov0, ov1, ov2, ov3, ov4, ov5) \
931 { \
932 if (newton) { \
933 if (vflag == 2 && nthreads > INTEL_HTHREADS) { \
934 _use_omp_pragma("omp barrier"); \
935 buffers->fdotr_reduce(nall, nthreads, f_stride, ov0, ov1, ov2, \
936 ov3, ov4, ov5); \
937 } \
938 } \
939 }
940
941 #define IP_PRE_fdotr_reduce(newton, nall, nthreads, f_stride, vflag, \
942 ov0, ov1, ov2, ov3, ov4, ov5) \
943 { \
944 if (newton) { \
945 if (vflag == 2 && nthreads <= INTEL_HTHREADS) { \
946 int lt = nall * 4; \
947 buffers->fdotr_reduce_l5(0, lt, nthreads, f_stride, ov0, ov1, \
948 ov2, ov3, ov4, ov5); \
949 } \
950 } \
951 }
952
953 #endif
954
955 #define IP_PRE_ev_tally_nbor(vflag, fpair, delx, dely, delz) \
956 { \
957 if (vflag == 1) { \
958 sv0 += delx * delx * fpair; \
959 sv1 += dely * dely * fpair; \
960 sv2 += delz * delz * fpair; \
961 sv3 += delx * dely * fpair; \
962 sv4 += delx * delz * fpair; \
963 sv5 += dely * delz * fpair; \
964 } \
965 }
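// For a pairwise force fpair acting along (delx,dely,delz), the six
// accumulators above form the per-pair virial tensor r*f: sv0..sv2 hold
// the diagonal xx, yy, zz components and sv3..sv5 the off-diagonal
// xy, xz, yz components.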
966
967 #define IP_PRE_ev_tally_nborv(vflag, dx, dy, dz, fpx, fpy, fpz) \
968 { \
969 if (vflag == 1) { \
970 sv0 += dx * fpx; \
971 sv1 += dy * fpy; \
972 sv2 += dz * fpz; \
973 sv3 += dx * fpy; \
974 sv4 += dx * fpz; \
975 sv5 += dy * fpz; \
976 } \
977 }
978
979 #define IP_PRE_ev_tally_nbor3(vflag, fj, fk, delx, dely, delz, delr2) \
980 { \
981 if (vflag == 1) { \
982 sv0 += delx * fj[0] + delr2[0] * fk[0]; \
983 sv1 += dely * fj[1] + delr2[1] * fk[1]; \
984 sv2 += delz * fj[2] + delr2[2] * fk[2]; \
985 sv3 += delx * fj[1] + delr2[0] * fk[1]; \
986 sv4 += delx * fj[2] + delr2[0] * fk[2]; \
987 sv5 += dely * fj[2] + delr2[1] * fk[2]; \
988 } \
989 }
990
991 #define IP_PRE_ev_tally_nbor3v(vflag, fj0, fj1, fj2, delx, dely, delz) \
992 { \
993 if (vflag == 1) { \
994 sv0 += delx * fj0; \
995 sv1 += dely * fj1; \
996 sv2 += delz * fj2; \
997 sv3 += delx * fj1; \
998 sv4 += delx * fj2; \
999 sv5 += dely * fj2; \
1000 } \
1001 }
1002
1003 #define IP_PRE_ev_tally_bond(eflag, VFLAG, eatom, vflag, ebond, i1, i2, \
1004 fbond, delx, dely, delz, obond, force, \
1005 newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
1006 ov5) \
1007 { \
1008 flt_t ev_pre; \
1009 if (newton) ev_pre = (flt_t)1.0; \
1010 else { \
1011 ev_pre = (flt_t)0.0; \
1012 if (i1 < nlocal) ev_pre += (flt_t)0.5; \
1013 if (i2 < nlocal) ev_pre += (flt_t)0.5; \
1014 } \
1015 \
1016 if (eflag) { \
1017 obond += ev_pre * ebond; \
1018 if (eatom) { \
1019 flt_t halfeng = ebond * (flt_t)0.5; \
1020 if (newton || i1 < nlocal) f[i1].w += halfeng; \
1021 if (newton || i2 < nlocal) f[i2].w += halfeng; \
1022 } \
1023 } \
1024 \
1025 if (VFLAG && vflag) { \
1026 ov0 += ev_pre * (delx * delx * fbond); \
1027 ov1 += ev_pre * (dely * dely * fbond); \
1028 ov2 += ev_pre * (delz * delz * fbond); \
1029 ov3 += ev_pre * (delx * dely * fbond); \
1030 ov4 += ev_pre * (delx * delz * fbond); \
1031 ov5 += ev_pre * (dely * delz * fbond); \
1032 } \
1033 }
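// The ev_pre weight above follows the usual LAMMPS tally convention: with
// newton_bond on the full energy/virial is accumulated once, while with
// newton_bond off each interaction contributes 1/2 per locally owned atom
// (1/3 and 1/4 in the angle and dihedral macros that follow).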
1034
1035 #define IP_PRE_ev_tally_angle(eflag, VFLAG, eatom, vflag, eangle, i1, \
1036 i2, i3, f1x, f1y, f1z, f3x, f3y, f3z, \
1037 delx1, dely1, delz1, delx2, dely2, delz2, \
1038 oeangle, force, newton, nlocal, ov0, ov1, \
1039 ov2, ov3, ov4, ov5) \
1040 { \
1041 flt_t ev_pre; \
1042 if (newton) ev_pre = (flt_t)1.0; \
1043 else { \
1044 ev_pre = (flt_t)0.0; \
1045 if (i1 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
1046 if (i2 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
1047 if (i3 < nlocal) ev_pre += (flt_t)0.3333333333333333; \
1048 } \
1049 \
1050 if (eflag) { \
1051 oeangle += ev_pre * eangle; \
1052 if (eatom) { \
1053 flt_t thirdeng = eangle * (flt_t)0.3333333333333333; \
1054 if (newton || i1 < nlocal) f[i1].w += thirdeng; \
1055 if (newton || i2 < nlocal) f[i2].w += thirdeng; \
1056 if (newton || i3 < nlocal) f[i3].w += thirdeng; \
1057 } \
1058 } \
1059 \
1060 if (VFLAG && vflag) { \
1061 ov0 += ev_pre * (delx1 * f1x + delx2 * f3x); \
1062 ov1 += ev_pre * (dely1 * f1y + dely2 * f3y); \
1063 ov2 += ev_pre * (delz1 * f1z + delz2 * f3z); \
1064 ov3 += ev_pre * (delx1 * f1y + delx2 * f3y); \
1065 ov4 += ev_pre * (delx1 * f1z + delx2 * f3z); \
1066 ov5 += ev_pre * (dely1 * f1z + dely2 * f3z); \
1067 } \
1068 }
1069
1070 #define IP_PRE_ev_tally_dihed(eflag, VFLAG, eatom, vflag, deng, i1, i2, \
1071 i3, i4, f1x, f1y, f1z, f3x, f3y, f3z, f4x,\
1072 f4y, f4z, vb1x, vb1y, vb1z, vb2x, vb2y, \
1073 vb2z, vb3x, vb3y, vb3z, oedihedral, force,\
1074 newton, nlocal, ov0, ov1, ov2, ov3, ov4, \
1075 ov5) \
1076 { \
1077 flt_t ev_pre; \
1078 if (newton) ev_pre = (flt_t)1.0; \
1079 else { \
1080 ev_pre = (flt_t)0.0; \
1081 if (i1 < nlocal) ev_pre += (flt_t)0.25; \
1082 if (i2 < nlocal) ev_pre += (flt_t)0.25; \
1083 if (i3 < nlocal) ev_pre += (flt_t)0.25; \
1084 if (i4 < nlocal) ev_pre += (flt_t)0.25; \
1085 } \
1086 \
1087 if (eflag) { \
1088 oedihedral += ev_pre * deng; \
1089 if (eatom) { \
1090 flt_t qdeng = deng * (flt_t)0.25; \
1091 if (newton || i1 < nlocal) f[i1].w += qdeng; \
1092 if (newton || i2 < nlocal) f[i2].w += qdeng; \
1093 if (newton || i3 < nlocal) f[i3].w += qdeng; \
1094 if (newton || i4 < nlocal) f[i4].w += qdeng; \
1095 } \
1096 } \
1097 \
1098 if (VFLAG && vflag) { \
1099 ov0 += ev_pre * (vb1x*f1x + vb2x*f3x + (vb3x+vb2x)*f4x); \
1100 ov1 += ev_pre * (vb1y*f1y + vb2y*f3y + (vb3y+vb2y)*f4y); \
1101 ov2 += ev_pre * (vb1z*f1z + vb2z*f3z + (vb3z+vb2z)*f4z); \
1102 ov3 += ev_pre * (vb1x*f1y + vb2x*f3y + (vb3x+vb2x)*f4y); \
1103 ov4 += ev_pre * (vb1x*f1z + vb2x*f3z + (vb3x+vb2x)*f4z); \
1104 ov5 += ev_pre * (vb1y*f1z + vb2y*f3z + (vb3y+vb2y)*f4z); \
1105 } \
1106 }
1107
1108 #define IP_PRE_ev_tally_atom(newton, eflag, vflag, f, fwtmp) \
1109 { \
1110 if (eflag) { \
1111 f[i].w += fwtmp; \
1112 oevdwl += sevdwl; \
1113 } \
1114 if (newton == 0 && vflag == 1) { \
1115 ov0 += sv0; \
1116 ov1 += sv1; \
1117 ov2 += sv2; \
1118 ov3 += sv3; \
1119 ov4 += sv4; \
1120 ov5 += sv5; \
1121 } \
1122 }
1123
1124 #define IP_PRE_ev_tally_atomq(newton, eflag, vflag, f, fwtmp) \
1125 { \
1126 if (eflag) { \
1127 f[i].w += fwtmp; \
1128 oevdwl += sevdwl; \
1129 oecoul += secoul; \
1130 } \
1131 if (newton == 0 && vflag == 1) { \
1132 ov0 += sv0; \
1133 ov1 += sv1; \
1134 ov2 += sv2; \
1135 ov3 += sv3; \
1136 ov4 += sv4; \
1137 ov5 += sv5; \
1138 } \
1139 }
1140
1141 }
1142
1143 #endif
1144