// clang-format off
/* -*- c++ -*- -------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------

Vector intrinsics are used temporarily for the Stillinger-Weber potential
so that advanced features of the AVX512 instruction set can be exploited
on early hardware. We expect compiler improvements for AVX512 to
eventually eliminate this requirement, so developing new code based on
this intrinsics implementation is not recommended. Please e-mail the
authors for more details.

------------------------------------------------------------------------- */

#ifndef INTEL_SIMD_H
#define INTEL_SIMD_H

#include "intel_preprocess.h"
#include "immintrin.h"
#include <cstdio>   // printf() used by the SIMD_print debugging helpers

#ifdef __AVX512F__

#ifndef _MM_SCALE_1
#define _MM_SCALE_1 1
#define _MM_SCALE_2 2
#define _MM_SCALE_4 4
#define _MM_SCALE_8 8
#endif

namespace ip_simd {

  typedef __mmask16 SIMD_mask;

  // Thin wrappers around the 512-bit vector types with implicit
  // conversion to and from the raw intrinsic types.
  struct SIMD_int {
    __m512i v;
    SIMD_int() {}
    SIMD_int(const __m512i in) : v(in) {}
    operator __m512i() const { return v; }
  };

  struct SIMD_float {
    __m512 v;
    SIMD_float() {}
    SIMD_float(const __m512 in) : v(in) {}
    operator __m512() const { return v; }
  };

  struct SIMD_double {
    __m512d v;
    SIMD_double() {}
    SIMD_double(const __m512d in) : v(in) {}
    operator __m512d() const { return v; }
  };

  // Traits mapping a scalar precision to its vector type and lane count.
  template<class flt_t>
  class SIMD_type {
  };

  template<>
  class SIMD_type<float> {
   public:
    typedef SIMD_float SIMD_vec;
    static inline int width() { return 16; }
  };

  template<>
  class SIMD_type<double> {
   public:
    typedef SIMD_double SIMD_vec;
    static inline int width() { return 8; }
  };

  template<class flt_t, class acc_t>
  class is_same {
   public:
    static const int value = 1;
  };

  template<>
  class is_same<float,double> {
   public:
    static const int value = 0;
  };
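
  /* Usage sketch (illustrative only, not part of the API): the SIMD_type
     traits let a precision-generic kernel pick the vector type and lane
     count at compile time, using the load/store wrappers and operators
     declared below. The array "x" and bound "n" are hypothetical; "x" is
     assumed 64-byte aligned with n a multiple of the vector width.

       template <class flt_t>
       void scale_by_two(flt_t *x, const int n) {
         typedef typename ip_simd::SIMD_type<flt_t>::SIMD_vec vec_t;
         const int w = ip_simd::SIMD_type<flt_t>::width();
         for (int i = 0; i < n; i += w) {
           vec_t v = ip_simd::SIMD_load(x + i);
           v = v * (flt_t)2.0;
           ip_simd::SIMD_store(x + i, v);
         }
       }
  */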

  // ------- Set Operations

  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
                           const int l3, const int l4, const int l5,
                           const int l6, const int l7, const int l8,
                           const int l9, const int l10, const int l11,
                           const int l12, const int l13, const int l14,
                           const int l15) {
    return _mm512_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7,
                             l8,l9,l10,l11,l12,l13,l14,l15);
  }

  inline SIMD_int SIMD_set(const int l) {
    return _mm512_set1_epi32(l);
  }

  inline SIMD_float SIMD_set(const float l) {
    return _mm512_set1_ps(l);
  }

  inline SIMD_double SIMD_set(const double l) {
    return _mm512_set1_pd(l);
  }

  inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
    return _mm512_maskz_mov_epi32(m, one);
  }

  inline SIMD_float SIMD_zero_masked(const SIMD_mask &m,
                                     const SIMD_float &one) {
    return _mm512_maskz_mov_ps(m, one);
  }

  inline SIMD_double SIMD_zero_masked(const SIMD_mask &m,
                                      const SIMD_double &one) {
    return _mm512_maskz_mov_pd(m, one);
  }

  inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m,
                             const SIMD_float &one) {
    return _mm512_mask_mov_ps(src,m,one);
  }

  inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m,
                              const SIMD_double &one) {
    return _mm512_mask_mov_pd(src,m,one);
  }
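
  /* Sketch (illustrative): SIMD_set with a mask performs a blend; lanes
     selected by the mask come from the last argument, the rest keep the
     source value. For example, clamping negative lanes to zero:

       SIMD_float v = SIMD_set(-1.5f);
       SIMD_mask neg = v < 0.0f;                 // comparison ops below
       v = SIMD_set(v, neg, SIMD_set(0.0f));     // negative lanes -> 0
  */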

  // -------- Load Operations

  inline SIMD_int SIMD_load(const int *p) {
    return _mm512_load_epi32(p);
  }

  inline SIMD_float SIMD_load(const float *p) {
    return _mm512_load_ps(p);
  }

  inline SIMD_double SIMD_load(const double *p) {
    return _mm512_load_pd(p);
  }

  inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
    return _mm512_maskz_load_epi32(m, p);
  }

  inline SIMD_float SIMD_loadz(const SIMD_mask &m, const float *p) {
    return _mm512_maskz_load_ps(m, p);
  }

  inline SIMD_double SIMD_loadz(const SIMD_mask &m, const double *p) {
    return _mm512_maskz_load_pd(m, p);
  }

  inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) {
    return _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) {
    return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
    return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }

  inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
                              const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, p,
                                       _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gather(const SIMD_mask &m, const float *p,
                                const SIMD_int &i) {
    return _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, p,
                                    _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
                                 const SIMD_int &i) {
    return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }

  // The primary template is never instantiated; only the float and
  // double specializations below are used.
  template <typename T>
  inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
                                      const SIMD_int &i) {
  }

  template <>
  inline SIMD_int SIMD_gatherz_offset<float>(const SIMD_mask &m, const int *p,
                                             const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32(_mm512_set1_epi32(0), m, i, p,
                                       _MM_SCALE_4);
  }

  template <>
  inline SIMD_int SIMD_gatherz_offset<double>(const SIMD_mask &m, const int *p,
                                              const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32(_mm512_set1_epi32(0), m, i, p,
                                       _MM_SCALE_8);
  }

  inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p,
                               const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32(_mm512_set1_epi32(0), m, i, p,
                                       _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
                                 const SIMD_int &i) {
    return _mm512_mask_i32gather_ps(_mm512_set1_ps((float)0), m, i, p,
                                    _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
                                  const SIMD_int &i) {
    return _mm512_mask_i32gather_pd(_mm512_set1_pd(0.0), m,
                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }
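
  /* Sketch (illustrative): gathers take 32-bit lane indices; for double
     data only the low 8 indices are used (via _mm512_castsi512_si256).
     A hypothetical neighbor-list lookup with a validity mask, where
     "jlist" is an aligned index array and "nall" a lane-count bound:

       SIMD_int j = SIMD_load(jlist);               // 16 packed indices
       SIMD_mask valid = j < nall;
       SIMD_float q_j = SIMD_gatherz(valid, q, j);  // 0 in invalid lanes
  */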

  // ------- Store Operations

  inline void SIMD_store(int *p, const SIMD_int &one) {
    _mm512_store_epi32(p,one);
  }

  inline void SIMD_store(float *p, const SIMD_float &one) {
    _mm512_store_ps(p,one);
  }

  inline void SIMD_store(double *p, const SIMD_double &one) {
    _mm512_store_pd(p,one);
  }

  inline void SIMD_scatter(const SIMD_mask &m, int *p,
                           const SIMD_int &i, const SIMD_int &vec) {
    _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
  }

  inline void SIMD_scatter(const SIMD_mask &m, float *p,
                           const SIMD_int &i, const SIMD_float &vec) {
    _mm512_mask_i32scatter_ps(p, m, i, vec, _MM_SCALE_4);
  }

  inline void SIMD_scatter(const SIMD_mask &m, double *p,
                           const SIMD_int &i, const SIMD_double &vec) {
    _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
                              _MM_SCALE_8);
  }
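
  /* Sketch (illustrative): a masked read-modify-write through gather and
     scatter is only safe when the active indices in i are unique;
     otherwise the SIMD_conflict_pi_reduce* routines further below must
     combine duplicate lanes first. With hypothetical p, i, m, delta:

       SIMD_float t = SIMD_gather(m, p, i);
       t = t + delta;
       SIMD_scatter(m, p, i, t);
  */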

  // ------- Arithmetic Operations

  inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_add_epi32(one,two);
  }

  inline SIMD_float operator+(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_add_ps(one,two);
  }

  inline SIMD_double operator+(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_add_pd(one,two);
  }

  inline SIMD_int operator+(const SIMD_int &one, const int two) {
    return _mm512_add_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator+(const SIMD_float &one, const float two) {
    return _mm512_add_ps(one,SIMD_set(two));
  }

  inline SIMD_double operator+(const SIMD_double &one, const double two) {
    return _mm512_add_pd(one,SIMD_set(two));
  }

  inline SIMD_int SIMD_add(const SIMD_mask &m,
                           const SIMD_int &one, const int two) {
    return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
  }

  inline SIMD_float SIMD_add(const SIMD_mask &m,
                             const SIMD_float &one, const float two) {
    return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
  }

  inline SIMD_double SIMD_add(const SIMD_mask &m,
                              const SIMD_double &one, const double two) {
    return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
  }

  inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_add_epi32(s,m,one,two);
  }

  inline SIMD_float SIMD_add(const SIMD_float &s, const SIMD_mask &m,
                             const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mask_add_ps(s,m,one,two);
  }

  inline SIMD_double SIMD_add(const SIMD_double &s, const SIMD_mask &m,
                              const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mask_add_pd(s,m,one,two);
  }

  inline SIMD_int SIMD_sub(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_sub_epi32(s,m,one,two);
  }

  inline SIMD_float SIMD_sub(const SIMD_float &s, const SIMD_mask &m,
                             const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mask_sub_ps(s,m,one,two);
  }

  inline SIMD_double SIMD_sub(const SIMD_double &s, const SIMD_mask &m,
                              const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mask_sub_pd(s,m,one,two);
  }

  inline SIMD_int operator-(const SIMD_int &one) {
    return _mm512_sub_epi32(SIMD_set((int)0),one);
  }

  inline SIMD_float operator-(const SIMD_float &one) {
    return _mm512_sub_ps(SIMD_set((float)0),one);
  }

  inline SIMD_double operator-(const SIMD_double &one) {
    return _mm512_sub_pd(SIMD_set((double)0),one);
  }

  inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_sub_epi32(one,two);
  }

  inline SIMD_float operator-(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_sub_ps(one,two);
  }

  inline SIMD_double operator-(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_sub_pd(one,two);
  }

  inline SIMD_int operator-(const SIMD_int &one, const int two) {
    return _mm512_sub_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator-(const SIMD_float &one, const float two) {
    return _mm512_sub_ps(one,SIMD_set(two));
  }

  inline SIMD_double operator-(const SIMD_double &one, const double two) {
    return _mm512_sub_pd(one,SIMD_set(two));
  }

  inline SIMD_int operator*(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mullo_epi32(one,two);
  }

  inline SIMD_float operator*(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mul_ps(one,two);
  }

  inline SIMD_double operator*(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mul_pd(one,two);
  }

  inline SIMD_int operator*(const SIMD_int &one, const int two) {
    return _mm512_mullo_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator*(const SIMD_float &one, const float two) {
    return _mm512_mul_ps(one,SIMD_set(two));
  }

  inline SIMD_double operator*(const SIMD_double &one, const double two) {
    return _mm512_mul_pd(one,SIMD_set(two));
  }

  inline SIMD_float operator/(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_div_ps(one,two);
  }

  inline SIMD_double operator/(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_div_pd(one,two);
  }

  inline SIMD_float SIMD_fma(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmadd_ps(one,two,three);
  }

  inline SIMD_double SIMD_fma(const SIMD_double &one, const SIMD_double &two,
                              const SIMD_double &three) {
    return _mm512_fmadd_pd(one,two,three);
  }

  inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmsub_ps(one,two,three);
  }

  inline SIMD_double SIMD_fms(const SIMD_double &one, const SIMD_double &two,
                              const SIMD_double &three) {
    return _mm512_fmsub_pd(one,two,three);
  }
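
  /* Sketch (illustrative): fused multiply-add chains are the idiomatic
     way to accumulate squared distances; for hypothetical lane-wise
     displacements dx, dy, dz:

       SIMD_float r2 = dx * dx;
       r2 = SIMD_fma(dy, dy, r2);
       r2 = SIMD_fma(dz, dz, r2);
  */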

  // ------- SVML operations

  inline SIMD_float SIMD_rcp(const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_rcp28_ps(one);
    #else
    return _mm512_recip_ps(one);
    #endif
  }

  inline SIMD_double SIMD_rcp(const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_rcp28_pd(one);
    #else
    return _mm512_recip_pd(one);
    #endif
  }

  inline SIMD_float SIMD_rcpz(const SIMD_mask &m, const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_maskz_rcp28_ps(m, one);
    #else
    return _mm512_mask_recip_ps(_mm512_set1_ps(0), m, one);
    #endif
  }

  inline SIMD_double SIMD_rcpz(const SIMD_mask &m, const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_maskz_rcp28_pd(m, one);
    #else
    return _mm512_mask_recip_pd(_mm512_set1_pd(0), m, one);
    #endif
  }

  inline SIMD_float SIMD_sqrt(const SIMD_float &one) {
    return _mm512_sqrt_ps(one);
  }

  inline SIMD_double SIMD_sqrt(const SIMD_double &one) {
    return _mm512_sqrt_pd(one);
  }

  inline SIMD_float SIMD_invsqrt(const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_rsqrt28_ps(one);
    #else
    return _mm512_invsqrt_ps(one);
    #endif
  }

  inline SIMD_double SIMD_invsqrt(const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_rsqrt28_pd(one);
    #else
    return _mm512_invsqrt_pd(one);
    #endif
  }

  inline SIMD_float SIMD_pow(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_pow_ps(one, two);
  }

  inline SIMD_double SIMD_pow(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_pow_pd(one, two);
  }

  inline SIMD_float SIMD_exp(const SIMD_float &one) {
    return _mm512_exp_ps(one);
  }

  inline SIMD_double SIMD_exp(const SIMD_double &one) {
    return _mm512_exp_pd(one);
  }
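
  /* Note (illustrative): on AVX512ER hardware the reciprocal and
     reciprocal square-root wrappers map to the 28-bit approximation
     instructions; elsewhere they fall back to SVML library calls, so
     results can differ slightly between the two paths. Typical use for
     inverse distances from a squared distance r2:

       SIMD_float r2inv = SIMD_rcp(r2);
       SIMD_float rinv  = SIMD_invsqrt(r2);
  */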

  // ------- Comparison operations

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
                           const SIMD_int &two) {
    return _mm512_mask_cmplt_epi32_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one,
                           const SIMD_float &two) {
    return _mm512_mask_cmplt_ps_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one,
                           const SIMD_double &two) {
    return _mm512_mask_cmplt_pd_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const int one,
                           const SIMD_int &two) {
    return _mm512_mask_cmplt_epi32_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const float one,
                           const SIMD_float &two) {
    return _mm512_mask_cmplt_ps_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const double one,
                           const SIMD_double &two) {
    return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmplt_epi32_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_int &one, const int two) {
    return _mm512_cmplt_epi32_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const SIMD_float &one, const float two) {
    return _mm512_cmplt_ps_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const SIMD_double &one, const double two) {
    return _mm512_cmplt_pd_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const int one, const SIMD_int &two) {
    return _mm512_cmplt_epi32_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<(const float one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<(const double one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<=(const int one, const SIMD_int &two) {
    return _mm512_cmple_epi32_mask(SIMD_set(one), two);
  }

  inline SIMD_mask operator<=(const float one, const SIMD_float &two) {
    return _mm512_cmple_ps_mask(SIMD_set(one), two);
  }

  inline SIMD_mask operator<=(const double one, const SIMD_double &two) {
    return _mm512_cmple_pd_mask(SIMD_set(one), two);
  }

  inline SIMD_mask operator>(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmpgt_epi32_mask(one,two);
  }

  inline SIMD_mask operator>(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(two,one);
  }

  inline SIMD_mask operator>(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(two,one);
  }

  inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmpeq_epi32_mask(one,two);
  }

  inline SIMD_mask operator==(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmpeq_ps_mask(one,two);
  }

  inline SIMD_mask operator==(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmpeq_pd_mask(one,two);
  }
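
  /* Sketch (illustrative): the masked SIMD_lt overloads refine an
     existing mask, which is how a per-lane cutoff test composes with a
     validity mask from a neighbor loop; r2, cutsq, valid hypothetical:

       SIMD_mask m = SIMD_lt(valid, r2, cutsq);   // valid && (r2 < cutsq)
  */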

  // ------- Typecast operations

  inline void SIMD_cast(const SIMD_int &one, SIMD_float &two) {
    two = _mm512_cvtepi32_ps(one);
  }

  inline void SIMD_cast(const SIMD_int &one, SIMD_double &two) {
    two = _mm512_cvtepi32lo_pd(one);
  }

  // ------- Reduction operations

  inline int SIMD_max(const SIMD_int &i) {
    return _mm512_reduce_max_epi32(i);
  }

  inline float SIMD_max(const SIMD_float &i) {
    return _mm512_reduce_max_ps(i);
  }

  inline double SIMD_max(const SIMD_double &i) {
    return _mm512_reduce_max_pd(i);
  }

  inline int SIMD_sum(const SIMD_int &i) {
    return _mm512_reduce_add_epi32(i);
  }

  inline float SIMD_sum(const SIMD_float &i) {
    return _mm512_reduce_add_ps(i);
  }

  inline double SIMD_sum(const SIMD_double &i) {
    return _mm512_reduce_add_pd(i);
  }

  // Sum the lanes of v1 that share the same index in i so that a
  // subsequent masked gather-add-scatter through i is safe.
  // i indices should be positive.
  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_float &v1) {
    // Set inactive lanes to -1 so they never conflict with active ones,
    // then detect duplicate indices among the active lanes.
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      // For each conflicting lane, lid holds the nearest lower lane with
      // the same index.
      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz);

      while (todo_mask) {
        // now_mask selects lanes whose lower-lane conflicts are all
        // resolved; each pulls in the accumulated value from lane lid.
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_float am_perm;
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // Double-precision variant; i indices should be positive.
  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_double &v1) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz);
      // Widen the low 8 lane ids to 64-bit for the _pd permute.
      lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));

      while (todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_double am_perm;
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // As above, but reduces three vectors with a single conflict
  // analysis; i indices should be positive.
  inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_float &v1, SIMD_float &v2,
                                       SIMD_float &v3) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz);

      while (todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_float am_perm;
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v2);
        v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm);
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v3);
        v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // Double-precision variant; i indices should be positive.
  inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_double &v1, SIMD_double &v2,
                                       SIMD_double &v3) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz  = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31), lz);
      lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));

      while (todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_double am_perm;
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v2);
        v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm);
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v3);
        v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }
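
  /* Sketch (illustrative): when several lanes update the same atom j, a
     plain gather-add-scatter would lose all but one contribution. The
     conflict reduction first sums duplicate lanes so the scatter is
     safe; fx and the index vector i are hypothetical, with fy, fz
     omitted for brevity:

       SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
       SIMD_float t = SIMD_gather(m, force, i);
       t = t + fx;
       SIMD_scatter(m, force, i, t);
  */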

  // ------- Bitwise and shift operations

  inline SIMD_int operator&(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_and_epi32(one,two);
  }

  inline SIMD_int operator>>(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_srlv_epi32(one,two);
  }

  inline SIMD_int operator<<(const SIMD_int &one, const unsigned two) {
    return _mm512_slli_epi32(one,two);
  }

  // -------- I/O operations

  inline void SIMD_print(const __m512i &vec) {
    for (int i = 0; i < 16; i++)
      printf("%d ", *((int*)&vec + i));
  }

  inline void SIMD_print(const __m512 &vec) {
    for (int i = 0; i < 16; i++)
      printf("%f ", *((float*)&vec + i));
  }

  inline void SIMD_print(const __m512d &vec) {
    for (int i = 0; i < 8; i++)
      printf("%f ", *((double*)&vec + i));
  }

  inline void SIMD_print(const SIMD_mask &mask) {
    SIMD_print(_mm512_maskz_mov_epi32(mask,SIMD_set(1)));
  }

  inline void SIMD_print(const char *id, const SIMD_mask &mask) {
    printf("%s ",id);
    SIMD_print(mask);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_int &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_float &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_double &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }
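
  /* Sketch (illustrative): the labeled overloads are intended for
     debugging; masks print as 0/1 integer lanes:

       SIMD_print("r2   ", r2);    // 16 floats, label first
       SIMD_print("mask ", m);     // 16 zeros and ones
  */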

  // ---------- LAMMPS operations
  #ifndef SW_GATHER_TEST
  inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z) {
    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
                                 _MM_SCALE_1);
    y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
                                 _MM_SCALE_1);
    z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
                                 _MM_SCALE_1);
  }

  inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z, SIMD_int &type) {
    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
                                 _MM_SCALE_1);
    y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
                                 _MM_SCALE_1);
    z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
                                 _MM_SCALE_1);
    type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
                                       _MM_SCALE_1);
  }
  #endif

  inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                               const SIMD_int &i, SIMD_double &x,
                               SIMD_double &y, SIMD_double &z) {
    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom,
                                 _MM_SCALE_2);
    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+1,
                                 _MM_SCALE_2);
    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+2,
                                 _MM_SCALE_2);
  }

  inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                               const SIMD_int &i, SIMD_double &x,
                               SIMD_double &y, SIMD_double &z, SIMD_int &type) {
    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom,
                                 _MM_SCALE_2);
    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+1,
                                 _MM_SCALE_2);
    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+2,
                                 _MM_SCALE_2);
    type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
                                       _MM_SCALE_2);
  }

  inline SIMD_float SIMD_ev_add(const SIMD_float &one,
                                const SIMD_float &two) {
    return _mm512_add_ps(one,two);
  }

  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
                                 const SIMD_double &two) {
    return _mm512_add_pd(one,two);
  }

  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
                                 const SIMD_float &two) {
    SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two));
    SIMD_double ans = _mm512_add_pd(one,twod);
    twod = _mm512_cvtps_pd(_mm512_castps512_ps256(
                             _mm512_shuffle_f32x4(two,two,238)));
    return _mm512_add_pd(ans,twod);
  }
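
  /* Note (illustrative): the mixed-precision overload widens both 8-lane
     halves of the float vector to double and accumulates each into the
     double accumulator, as needed for mixed-precision energy/virial
     tallies. With a hypothetical SIMD_float energy term evdwl_f:

       SIMD_double ev = SIMD_set(0.0);
       ev = SIMD_ev_add(ev, evdwl_f);
  */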
890 
SIMD_jeng_update(const SIMD_mask & rmask,float * force,const SIMD_int & joffset,SIMD_float & eng)891   inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force,
892                                const SIMD_int &joffset, SIMD_float &eng) {
893     SIMD_float jeng;
894     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
895     jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset,
896                                     force, _MM_SCALE_1);
897     jeng = jeng + eng;
898     _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1);
899   }
900 
SIMD_jeng_update(const SIMD_mask & rmask,double * force,const SIMD_int & joffset,SIMD_double & eng)901   inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
902                                const SIMD_int &joffset, SIMD_double &eng) {
903     SIMD_double jeng;
904     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
905     jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
906                                     _mm512_castsi512_si256(joffset),
907                                     force, _MM_SCALE_2);
908     jeng = jeng + eng;
909     _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
910                               jeng, _MM_SCALE_2);
911   }
912 
SIMD_jeng_update(const SIMD_mask & rmask,double * force,const SIMD_int & joffset,SIMD_float & eng)913   inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
914                                const SIMD_int &joffset, SIMD_float &eng) {
915     SIMD_double engd, jeng;
916     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
917     SIMD_conflict_pi_reduce1(rmask, joffset, engd);
918     jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
919                                     _mm512_castsi512_si256(joffset),
920                                     force, _MM_SCALE_2);
921     jeng = jeng + engd;
922     _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
923                               jeng, _MM_SCALE_2);
924 
925     SIMD_mask rmask2 = rmask >> 8;
926     engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
927                              _mm512_shuffle_f32x4(eng,eng,238)));
928     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
929     SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
930     jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
931                                     _mm512_castsi512_si256(joffset2),
932                                     force, _MM_SCALE_2);
933     jeng = jeng + engd;
934     _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
935                               jeng, _MM_SCALE_2);
936   }
937 
SIMD_jeng_update_hi(const SIMD_mask & mask,float * force,const SIMD_int & joffset1,SIMD_float & eng)938   inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
939                                   const SIMD_int &joffset1, SIMD_float &eng) {
940   }
941 
SIMD_jeng_update_hi(const SIMD_mask & mask,double * force,const SIMD_int & joffset1,SIMD_double & eng)942   inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force,
943                                   const SIMD_int &joffset1, SIMD_double &eng) {
944     SIMD_mask rmask = mask >> 8;
945     SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238);
946 
947     SIMD_double jeng;
948     SIMD_conflict_pi_reduce1(rmask, joffset, eng);
949     jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
950                                     _mm512_castsi512_si256(joffset),
951                                     force, _MM_SCALE_2);
952     jeng = jeng + eng;
953     _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
954                               jeng, _MM_SCALE_2);
955   }
956 
SIMD_safe_jforce(const SIMD_mask & m,float * force,const SIMD_int & i,SIMD_float & fx,SIMD_float & fy,SIMD_float & fz)957   inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
958                                const SIMD_int &i, SIMD_float &fx,
959                                SIMD_float &fy, SIMD_float &fz) {
960     SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
961     SIMD_float jfrc;
962     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
963                                     _MM_SCALE_1);
964     jfrc = jfrc + fx;
965     _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
966     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
967                                     _MM_SCALE_1);
968     jfrc = jfrc + fy;
969     _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
970     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
971                                     _MM_SCALE_1);
972     jfrc = jfrc + fz;
973     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
974   }
975 
SIMD_safe_jforce(const SIMD_mask & m,double * force,const SIMD_int & i,SIMD_double & fx,SIMD_double & fy,SIMD_double & fz)976   inline void SIMD_safe_jforce(const SIMD_mask &m, double *force,
977                                const SIMD_int &i, SIMD_double &fx,
978                                SIMD_double &fy, SIMD_double &fz) {
979     SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
980     SIMD_double jfrc;
981     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
982                                     _mm512_castsi512_si256(i), force,
983                                     _MM_SCALE_2);
984     jfrc = jfrc + fx;
985     _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
986                               _MM_SCALE_2);
987     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
988                                     _mm512_castsi512_si256(i), force + 1,
989                                     _MM_SCALE_2);
990     jfrc = jfrc + fy;
991     _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
992                               _MM_SCALE_2);
993     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
994                                     _mm512_castsi512_si256(i), force + 2,
995                                     _MM_SCALE_2);
996     jfrc = jfrc + fz;
997     _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
998                               _MM_SCALE_2);
999   }
1000 
SIMD_safe_jforce(const SIMD_mask & rmask,double * force,const SIMD_int & joffset,SIMD_float & amx,SIMD_float & amy,SIMD_float & amz)1001   inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
1002                                const SIMD_int &joffset, SIMD_float &amx,
1003                                SIMD_float &amy, SIMD_float &amz) {
1004     SIMD_double amxd, amyd, amzd;
1005     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
1006     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
1007     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
1008     SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
1009     SIMD_double jfrc;
1010     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
1011                                     _mm512_castsi512_si256(joffset),
1012                                     force, _MM_SCALE_2);
1013     jfrc = jfrc + amxd;
1014     _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
1015                               jfrc, _MM_SCALE_2);
1016     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
1017                                     _mm512_castsi512_si256(joffset),
1018                                     force + 1, _MM_SCALE_2);
1019     jfrc = jfrc + amyd;
1020     _mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
1021                               jfrc, _MM_SCALE_2);
1022     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
1023                                     _mm512_castsi512_si256(joffset),
1024                                     force + 2, _MM_SCALE_2);
1025     jfrc = jfrc + amzd;
1026     _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
1027                               jfrc, _MM_SCALE_2);
1028 
1029     SIMD_mask rmask2 = rmask >> 8;
1030     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1031                                                   _mm512_shuffle_f32x4(amx,amx,238)));
1032     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1033                                                   _mm512_shuffle_f32x4(amy,amy,238)));
1034     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1035                                                   _mm512_shuffle_f32x4(amz,amz,238)));
1036     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
1037     SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
1038     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
1039                                     _mm512_castsi512_si256(joffset2),
1040                                     force, _MM_SCALE_2);
1041     jfrc = jfrc + amxd;
1042     _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
1043                               jfrc, _MM_SCALE_2);
1044     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
1045                                     _mm512_castsi512_si256(joffset2),
1046                                     force + 1, _MM_SCALE_2);
1047     jfrc = jfrc + amyd;
1048     _mm512_mask_i32scatter_pd(force+1, rmask2,
1049                               _mm512_castsi512_si256(joffset2), jfrc,
1050                               _MM_SCALE_2);
1051     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
1052                                     _mm512_castsi512_si256(joffset2),
1053                                     force + 2, _MM_SCALE_2);
1054     jfrc = jfrc + amzd;
1055     _mm512_mask_i32scatter_pd(force+2, rmask2,
1056                               _mm512_castsi512_si256(joffset2), jfrc,
1057                               _MM_SCALE_2);
1058   }
1059 
SIMD_jforce_update(const SIMD_mask & m,float * force,const SIMD_int & i,const SIMD_float & fx,const SIMD_float & fy,const SIMD_float & fz)1060   inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
1061                                  const SIMD_int &i, const SIMD_float &fx,
1062                                  const SIMD_float &fy, const SIMD_float &fz) {
1063     SIMD_float jfrc;
1064     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
1065                                     _MM_SCALE_1);
1066     jfrc = jfrc - fx;
1067     _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
1068     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
1069                                     _MM_SCALE_1);
1070     jfrc = jfrc - fy;
1071     _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
1072     jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
1073                                     _MM_SCALE_1);
1074     jfrc = jfrc - fz;
1075     _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
1076   }
1077 
1078   template <class ft>
SIMD_scalar_update(const int jj,const int * ejnum,ft * force,const int * i,const double * fx,const double * fy,const double * fz,const double * fx2,const double * fy2,const double * fz2)1079   inline void SIMD_scalar_update(const int jj, const int* ejnum, ft *force,
1080                                  const int* i, const double *fx,
1081                                  const double *fy, const double *fz,
1082                                  const double *fx2, const double *fy2,
1083                                  const double *fz2) {
1084     #pragma novector
1085     for (int k=0; k<8; k++) {
1086       if (jj < ejnum[k]) {
1087         const int j = i[k];
1088         force[j].x -= fx[k];
1089         force[j].y -= fy[k];
1090         force[j].z -= fz[k];
1091       }
1092     }
1093 
1094     #pragma novector
1095     for (int k=8; k<16; k++) {
1096       if (jj < ejnum[k]) {
1097         const int j = i[k];
1098         force[j].x -= fx2[k-8];
1099         force[j].y -= fy2[k-8];
1100         force[j].z -= fz2[k-8];
1101       }
1102     }
1103   }
1104 
SIMD_jforce_update(const SIMD_mask & m,double * force,const SIMD_int & i,const SIMD_double & fx,const SIMD_double & fy,const SIMD_double & fz)1105   inline void SIMD_jforce_update(const SIMD_mask &m, double *force,
1106                                  const SIMD_int &i, const SIMD_double &fx,
1107                                  const SIMD_double &fy, const SIMD_double &fz)   {
1108     SIMD_double jfrc;
1109     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
1110                                     _mm512_castsi512_si256(i), force,
1111                                     _MM_SCALE_2);
1112     jfrc = jfrc - fx;
1113     _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
1114                               _MM_SCALE_2);
1115     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
1116                                     _mm512_castsi512_si256(i), force + 1,
1117                                     _MM_SCALE_2);
1118     jfrc = jfrc - fy;
1119     _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
1120                               _MM_SCALE_2);
1121     jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
1122                                     _mm512_castsi512_si256(i), force + 2,
1123                                     _MM_SCALE_2);
1124     jfrc = jfrc - fz;
1125     _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
1126                               _MM_SCALE_2);
1127   }
1128 
SIMD_jforce_update(const SIMD_mask & rmask,double * force,const SIMD_int & joffset,SIMD_float & amx,SIMD_float & amy,SIMD_float & amz)1129   inline void SIMD_jforce_update(const SIMD_mask &rmask,
1130          double *force, const SIMD_int &joffset, SIMD_float &amx,
1131                                  SIMD_float &amy, SIMD_float &amz) {
1132     SIMD_double amxd, amyd, amzd;
1133     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
1134     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
1135     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
1136     SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
1137     SIMD_jforce_update(rmask, force, joffset, amxd, amyd, amzd);
1138 
1139     SIMD_mask rmask2 = rmask >> 8;
1140     amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1141                                 _mm512_shuffle_f32x4(amx,amx,238)));
1142     amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1143                                 _mm512_shuffle_f32x4(amy,amy,238)));
1144     amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
1145                                 _mm512_shuffle_f32x4(amz,amz,238)));
1146     SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
1147     SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
1148     SIMD_jforce_update(rmask2, force, joffset2, amxd, amyd, amzd);
1149   }
1150 
SIMD_cache3(float * pr,const int offset,const SIMD_float & fx,const SIMD_float & fy,const SIMD_float & fz)1151   inline void SIMD_cache3(float *pr, const int offset,
1152                           const SIMD_float &fx,
1153                           const SIMD_float &fy, const SIMD_float &fz) {
1154     float *p = pr;
1155     SIMD_float t;
1156     t = SIMD_load(p);
1157     t = t + fx;
1158     SIMD_store(p,t);
1159     p = p + offset;
1160     t = SIMD_load(p);
1161     t = t + fy;
1162     SIMD_store(p, t);
1163     p = p + offset;
1164     t = SIMD_load(p);
1165     t = t + fz;
1166     SIMD_store(p, t);
1167   }
1168 
SIMD_cache3(double * pr,const int offset,const SIMD_double & fx,const SIMD_double & fy,const SIMD_double & fz)1169   inline void SIMD_cache3(double *pr, const int offset,
1170                           const SIMD_double &fx,
1171                           const SIMD_double &fy, const SIMD_double &fz) {
1172     double *p = pr;
1173     SIMD_double t;
1174     t = SIMD_load(p);
1175     t = t + fx;
1176     SIMD_store(p,t);
1177     p = p + offset;
1178     t = SIMD_load(p);
1179     t = t + fy;
1180     SIMD_store(p, t);
1181     p = p + offset;
1182     t = SIMD_load(p);
1183     t = t + fz;
1184     SIMD_store(p, t);
1185   }
1186 
  inline void SIMD_cache3(double *pr, const int foffset,
                          const SIMD_float &fx,
                          const SIMD_float &fy, const SIMD_float &fz) {
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t, fd;

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fx));
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fx,fx,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fy));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fy,fy,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fz));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fz,fz,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
  }

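  // Intentional no-op: the split two-vector form is not needed in all-float
  // mode; this overload presumably exists only so callers compile uniformly
  // across precision modes.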
  inline void SIMD_cache3(float *pr, const int offset,
                          const SIMD_float &fx, const SIMD_float &fy,
                          const SIMD_float &fz, const SIMD_float &fx2,
                          const SIMD_float &fy2, const SIMD_float &fz2) {
  }

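  // Two-vector double variant: each component is written to two consecutive
  // offset-sized blocks of the cache.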
  inline void SIMD_cache3(double *pr, const int foffset,
                          const SIMD_double &fx, const SIMD_double &fy,
                          const SIMD_double &fz, const SIMD_double &fx2,
                          const SIMD_double &fy2, const SIMD_double &fz2) {
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t;

    t = SIMD_load(p);
    t = t + fx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fx2;
    SIMD_store(p,t);

    p = p + offset;
    t = SIMD_load(p);
    t = t + fy;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fy2;
    SIMD_store(p,t);

    p = p + offset;
    t = SIMD_load(p);
    t = t + fz;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fz2;
    SIMD_store(p,t);
  }

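  // ------- Masked force accumulation for three-body loops
  // Subtracts the j-atom contribution from the central-atom and j-atom
  // accumulators where kmask is set; the trailing *2 arguments are unused in
  // the same-precision overloads and appear to exist only to keep the
  // interface identical to the mixed-precision version below.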
  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
                               const SIMD_float &fjy, const SIMD_float &fjz,
                               SIMD_float &fxtmp, SIMD_float &fytmp,
                               SIMD_float &fztmp, SIMD_float &fjxtmp,
                               SIMD_float &fjytmp, SIMD_float &fjztmp,
                               SIMD_float &fxtmp2, SIMD_float &fytmp2,
                               SIMD_float &fztmp2, SIMD_float &fjxtmp2,
                               SIMD_float &fjytmp2, SIMD_float &fjztmp2) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
  }

  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx,
                               const SIMD_double &fjy, const SIMD_double &fjz,
                               SIMD_double &fxtmp, SIMD_double &fytmp,
                               SIMD_double &fztmp, SIMD_double &fjxtmp,
                               SIMD_double &fjytmp, SIMD_double &fjztmp,
                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
  }

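  // Mixed-precision SIMD_accumulate3: each float half is widened separately;
  // the shifted mask and the *2 accumulators cover the upper eight lanes.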
  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
                               const SIMD_float &fjy, const SIMD_float &fjz,
                               SIMD_double &fxtmp, SIMD_double &fytmp,
                               SIMD_double &fztmp, SIMD_double &fjxtmp,
                               SIMD_double &fjytmp, SIMD_double &fjztmp,
                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
    SIMD_mask kmask2 = kmask >> 8;
    SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx));
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjx,fjx,238)));
    fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd);
    fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjy));
    fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjy,fjy,238)));
    fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd);
    fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjz));
    fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjz,fjz,238)));
    fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd);
    fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
  }

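  // SIMD_acc_cache3 fuses SIMD_accumulate3 with a cache update: the
  // central-atom accumulator is decremented by (fj - fk), the j accumulator
  // by fj, and the k-atom force is added into the cache at pr.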
  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
                              const SIMD_float &fjy, const SIMD_float &fjz,
                              const SIMD_float &fkx, const SIMD_float &fky,
                              const SIMD_float &fkz,
                              SIMD_float &fxtmp, SIMD_float &fytmp,
                              SIMD_float &fztmp, SIMD_float &fjxtmp,
                              SIMD_float &fjytmp, SIMD_float &fjztmp,
                              SIMD_float &fxtmp2, SIMD_float &fytmp2,
                              SIMD_float &fztmp2, SIMD_float &fjxtmp2,
                              SIMD_float &fjytmp2, SIMD_float &fjztmp2,
                              float *pr, const int offset) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz - fkz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
    float *p = pr;
    SIMD_float t;
    t = SIMD_load(p);
    t = t + fkx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fky;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fkz;
    SIMD_store(p, t);
  }

  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx,
                              const SIMD_double &fjy, const SIMD_double &fjz,
                              const SIMD_double &fkx, const SIMD_double &fky,
                              const SIMD_double &fkz,
                              SIMD_double &fxtmp, SIMD_double &fytmp,
                              SIMD_double &fztmp, SIMD_double &fjxtmp,
                              SIMD_double &fjytmp, SIMD_double &fjztmp,
                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
                              double *pr, const int offset) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz - fkz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
    double *p = pr;
    SIMD_double t;
    t = SIMD_load(p);
    t = t + fkx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fky;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fkz;
    SIMD_store(p, t);
  }

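  // Mixed-precision SIMD_acc_cache3: lower and upper float halves are
  // widened to double and written to consecutive cache blocks.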
  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
                              const SIMD_float &fjy, const SIMD_float &fjz,
                              const SIMD_float &fkx, const SIMD_float &fky,
                              const SIMD_float &fkz,
                              SIMD_double &fxtmp, SIMD_double &fytmp,
                              SIMD_double &fztmp, SIMD_double &fjxtmp,
                              SIMD_double &fjytmp, SIMD_double &fjztmp,
                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
                              double *pr, const int foffset) {
    SIMD_mask kmask2 = kmask >> 8;
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t;

    SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx));
    SIMD_double delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fkx));
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd - delfdk);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjx,fjx,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fkx,fkx,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd - delfdk);
    fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjy));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fky));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd - delfdk);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjy,fjy,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fky,fky,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd - delfdk);
    fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjz));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fkz));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd - delfdk);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fjz,fjz,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(fkz,fkz,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd - delfdk);
    fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
  }

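  // ------- Energy accumulation
  // Two-body energy tally: the full pair energy goes into the running sum;
  // with per-atom energy enabled, each of the two atoms receives half.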
  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
                               const SIMD_float &evdwl, const int eatom,
                               SIMD_float &sevdwl, SIMD_float &fwtmp,
                               SIMD_float &fjtmp, SIMD_float &fwtmp2,
                               SIMD_float &fjtmp2) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl);
    if (eatom) {
      const SIMD_float hevdwl = evdwl * (float)0.5;
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
    }
  }

  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
                               const SIMD_double &evdwl, const int eatom,
                               SIMD_double &sevdwl, SIMD_double &fwtmp,
                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
                               SIMD_double &fjtmp2) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl);
    if (eatom) {
      const SIMD_double hevdwl = evdwl * (double)0.5;
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
    }
  }

  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
                               const SIMD_float &evdwl, const int eatom,
                               SIMD_double &sevdwl, SIMD_double &fwtmp,
                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
                               SIMD_double &fjtmp2) {
    SIMD_double evdwld;
    evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(evdwl));
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwld);
    if (eatom) {
      const SIMD_double hevdwl = evdwld * (double)0.5;
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
    }
    SIMD_mask hmask2 = hmask >> 8;
    evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(evdwl,evdwl,238)));
    sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, evdwld);
    if (eatom) {
      const SIMD_double hevdwl = evdwld * (double)0.5;
      fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
      fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
    }
  }

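  // Three-body energy tally: each of the three atoms receives a third of the
  // energy; the k-atom share is conflict-reduced, then gathered/scattered
  // into the per-atom energy field stored after the x/y/z force components
  // (force + 3).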
  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
                             const int eatom, SIMD_float &sevdwl,
                             SIMD_float &fwtmp, SIMD_float &fjtmp,
                             SIMD_float &fwtmp2, SIMD_float &fjtmp2,
                             const SIMD_int &k, float *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask,
                                                 k, force + 3, _MM_SCALE_1);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1);
    }
  }

  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad,
                             const int eatom, SIMD_double &sevdwl,
                             SIMD_double &fwtmp, SIMD_double &fjtmp,
                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
                             const SIMD_int &k, double *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
                                                  _mm512_castsi512_si256(k),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
                                keng, _MM_SCALE_2);
    }
  }

  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
                             const int eatom, SIMD_double &sevdwl,
                             SIMD_double &fwtmp, SIMD_double &fjtmp,
                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
                             const SIMD_int &k, double *force) {
    SIMD_double facradd;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad));
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd);
    if (eatom) {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
                                                  _mm512_castsi512_si256(k),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
                                keng, _MM_SCALE_2);
    }
    SIMD_mask hmask2 = hmask >> 8;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(facrad,facrad,238)));
    sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, facradd);
    if (eatom) {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
      fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
      SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
      SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
                                                  hmask2,
                                                  _mm512_castsi512_si256(k2),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
                                keng, _MM_SCALE_2);
    }
  }

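  // ------- Virial accumulation
  // Pair virial tally: accumulates the six components (xx, yy, zz, xy, xz,
  // yz) of ev_pre * fpair weighted by the products of the del* separations.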
  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
                                 const float ev_pre,
                 const SIMD_float &fpair, const SIMD_float &delx,
                 const SIMD_float &dely,  const SIMD_float &delz,
                 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
                 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
    if (vflag == 1) {
      const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
      sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
      sv1 = SIMD_add(sv1, m, sv1, dely * dely * prefpair);
      sv2 = SIMD_add(sv2, m, sv2, delz * delz * prefpair);
      sv3 = SIMD_add(sv3, m, sv3, delx * dely * prefpair);
      sv4 = SIMD_add(sv4, m, sv4, delx * delz * prefpair);
      sv5 = SIMD_add(sv5, m, sv5, dely * delz * prefpair);
    }
  }

  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
                                 const double ev_pre,
                 const SIMD_double &fpair, const SIMD_double &delx,
                 const SIMD_double &dely,  const SIMD_double &delz,
                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_double prefpair = SIMD_set(ev_pre) * fpair;
      sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
      sv1 = SIMD_add(sv1, m, sv1, dely * dely * prefpair);
      sv2 = SIMD_add(sv2, m, sv2, delz * delz * prefpair);
      sv3 = SIMD_add(sv3, m, sv3, delx * dely * prefpair);
      sv4 = SIMD_add(sv4, m, sv4, delx * delz * prefpair);
      sv5 = SIMD_add(sv5, m, sv5, dely * delz * prefpair);
    }
  }

  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
                                 const float ev_pre,
                 const SIMD_float &fpair, const SIMD_float &delx,
                 const SIMD_float &dely,  const SIMD_float &delz,
                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_mask m2 = m >> 8;
      const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
      SIMD_float dpair = delx * delx * prefpair;
      SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv0 = SIMD_add(sv0, m, sv0, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv0 = SIMD_add(sv0, m2, sv0, dpaird);

      dpair = dely * dely * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv1 = SIMD_add(sv1, m, sv1, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv1 = SIMD_add(sv1, m2, sv1, dpaird);

      dpair = delz * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv2 = SIMD_add(sv2, m, sv2, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv2 = SIMD_add(sv2, m2, sv2, dpaird);

      dpair = delx * dely * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv3 = SIMD_add(sv3, m, sv3, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv3 = SIMD_add(sv3, m2, sv3, dpaird);

      dpair = delx * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv4 = SIMD_add(sv4, m, sv4, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv4 = SIMD_add(sv4, m2, sv4, dpaird);

      dpair = dely * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv5 = SIMD_add(sv5, m, sv5, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv5 = SIMD_add(sv5, m2, sv5, dpaird);
    }
  }

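  // Three-body virial tally: combines the j and k force vectors with their
  // respective separation vectors (del* for j, delr2* for k).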
  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
                 const SIMD_float &fj0, const SIMD_float &fj1,
                 const SIMD_float &fj2, const SIMD_float &fk0,
                 const SIMD_float &fk1, const SIMD_float &fk2,
                 const SIMD_float &delx, const SIMD_float &dely,
                 const SIMD_float &delz, const SIMD_float &delr2x,
                 const SIMD_float &delr2y, const SIMD_float &delr2z,
                 SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
                 SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
    if (vflag == 1) {
      sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
      sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
      sv2 = SIMD_add(sv2, m, sv2, delz * fj2 + delr2z * fk2);
      sv3 = SIMD_add(sv3, m, sv3, delx * fj1 + delr2x * fk1);
      sv4 = SIMD_add(sv4, m, sv4, delx * fj2 + delr2x * fk2);
      sv5 = SIMD_add(sv5, m, sv5, dely * fj2 + delr2y * fk2);
    }
  }

  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
                 const SIMD_double &fj0, const SIMD_double &fj1,
                 const SIMD_double &fj2, const SIMD_double &fk0,
                 const SIMD_double &fk1, const SIMD_double &fk2,
                 const SIMD_double &delx, const SIMD_double &dely,
                 const SIMD_double &delz, const SIMD_double &delr2x,
                 const SIMD_double &delr2y, const SIMD_double &delr2z,
                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
      sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
      sv2 = SIMD_add(sv2, m, sv2, delz * fj2 + delr2z * fk2);
      sv3 = SIMD_add(sv3, m, sv3, delx * fj1 + delr2x * fk1);
      sv4 = SIMD_add(sv4, m, sv4, delx * fj2 + delr2x * fk2);
      sv5 = SIMD_add(sv5, m, sv5, dely * fj2 + delr2y * fk2);
    }
  }

  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
                 const SIMD_float &fj0, const SIMD_float &fj1,
                 const SIMD_float &fj2, const SIMD_float &fk0,
                 const SIMD_float &fk1, const SIMD_float &fk2,
                 const SIMD_float &delx, const SIMD_float &dely,
                 const SIMD_float &delz, const SIMD_float &delr2x,
                 const SIMD_float &delr2y, const SIMD_float &delr2z,
                 SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
                 SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_mask m2 = m >> 8;
      SIMD_float dpair = delx * fj0 + delr2x * fk0;
      SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv0 = SIMD_add(sv0, m, sv0, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv0 = SIMD_add(sv0, m2, sv0, dpaird);

      dpair = dely * fj1 + delr2y * fk1;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv1 = SIMD_add(sv1, m, sv1, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv1 = SIMD_add(sv1, m2, sv1, dpaird);

      dpair = delz * fj2 + delr2z * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv2 = SIMD_add(sv2, m, sv2, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv2 = SIMD_add(sv2, m2, sv2, dpaird);

      dpair = delx * fj1 + delr2x * fk1;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv3 = SIMD_add(sv3, m, sv3, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv3 = SIMD_add(sv3, m2, sv3, dpaird);

      dpair = delx * fj2 + delr2x * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv4 = SIMD_add(sv4, m, sv4, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv4 = SIMD_add(sv4, m2, sv4, dpaird);

      dpair = dely * fj2 + delr2y * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv5 = SIMD_add(sv5, m, sv5, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(dpair,dpair,238)));
      sv5 = SIMD_add(sv5, m2, sv5, dpaird);
    }
  }

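  // ------- Conflict-safe force updates
  // Adds to the running i-atom accumulators, then performs a conflict-reduced
  // scatter-add for the j atoms so duplicate indices in joffset do not drop
  // updates.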
  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
         float *force, const SIMD_int &joffset, SIMD_float &amx,
         SIMD_float &amy, SIMD_float &amz, SIMD_float &fxtmp,
         SIMD_float &fytmp, SIMD_float &fztmp, SIMD_float &fxtmp2,
         SIMD_float &fytmp2, SIMD_float &fztmp2) {
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx);
    fytmp = SIMD_add(fytmp, rmask, fytmp, amy);
    fztmp = SIMD_add(fztmp, rmask, fztmp, amz);
    SIMD_conflict_pi_reduce3(rmask, joffset, amx, amy, amz);
    SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
  }

  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
         double *force, const SIMD_int &joffset, SIMD_double &amx,
         SIMD_double &amy, SIMD_double &amz, SIMD_double &fxtmp,
         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
         SIMD_double &fytmp2, SIMD_double &fztmp2) {
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx);
    fytmp = SIMD_add(fytmp, rmask, fytmp, amy);
    fztmp = SIMD_add(fztmp, rmask, fztmp, amz);
    SIMD_conflict_pi_reduce3(rmask, joffset, amx, amy, amz);
    SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
  }

  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
         double *force, const SIMD_int &joffset, SIMD_float &amx,
         SIMD_float &amy, SIMD_float &amz, SIMD_double &fxtmp,
         SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
         SIMD_double &fytmp2, SIMD_double &fztmp2) {
    SIMD_double amxd, amyd, amzd;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amxd);
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
    fytmp = SIMD_add(fytmp, rmask, fytmp, amyd);
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
    fztmp = SIMD_add(fztmp, rmask, fztmp, amzd);
    SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
    SIMD_jforce_update(rmask, force, joffset, amxd, amyd, amzd);

    SIMD_mask rmask2 = rmask >> 8;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(amx,amx,238)));
    fxtmp2 = SIMD_add(fxtmp2, rmask2, fxtmp2, amxd);
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(amy,amy,238)));
    fytmp2 = SIMD_add(fytmp2, rmask2, fytmp2, amyd);
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
                                _mm512_shuffle_f32x4(amz,amz,238)));
    fztmp2 = SIMD_add(fztmp2, rmask2, fztmp2, amzd);
    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
    SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
    SIMD_jforce_update(rmask2, force, joffset2, amxd, amyd, amzd);
  }

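  // Scatter-add of accumulated forces (and, with EFLAG/eatom, per-atom
  // energy) for the i atoms; no conflict reduction is done here, so the
  // indices in i are presumably unique within the vector.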
  inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
                                 const SIMD_int &i, const SIMD_float &fx,
                                 const SIMD_float &fy, const SIMD_float &fz,
                                 const int EFLAG, const int eatom,
                                 const SIMD_float &fwtmp) {
    SIMD_float jfrc;
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
                                    _MM_SCALE_1);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
                                    _MM_SCALE_1);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
                                    _MM_SCALE_1);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
    if (EFLAG) {
      if (eatom) {
        jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
                                        _MM_SCALE_1);
        jfrc = jfrc + fwtmp;
        _mm512_mask_i32scatter_ps(force+3, m, i, jfrc, _MM_SCALE_1);
      }
    }
  }

  inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
                                 const SIMD_int &i, const SIMD_double &fx,
                                 const SIMD_double &fy, const SIMD_double &fz,
                                 const int EFLAG, const int eatom,
                                 const SIMD_double &fwtmp) {
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force,
                                    _MM_SCALE_2);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 1,
                                    _MM_SCALE_2);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 2,
                                    _MM_SCALE_2);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    if (EFLAG) {
      if (eatom) {
        jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                        _mm512_castsi512_si256(i),
                                        force + 3, _MM_SCALE_2);
        jfrc = jfrc + fwtmp;
        _mm512_mask_i32scatter_pd(force+3, m, _mm512_castsi512_si256(i), jfrc,
                                  _MM_SCALE_2);
      }
    }
  }

  #ifdef SW_GATHER_TEST
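  // Experimental AoS gather: pulls in four 16-byte atom records (x, y, z,
  // type, presumably) per 512-bit register via 128-bit inserts, using byte
  // offsets from i, then transposes with two-source permutes and blends into
  // packed x, y, z, and type vectors.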
  template <class atom_t>
  inline void SIMD_atom_gather(const SIMD_mask &m, const atom_t *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z, SIMD_int &type) {
    int jv_scalar[16] __attribute__((aligned(64)));
    int jm_scalar[16] __attribute__((aligned(64)));
    _mm512_store_epi32(jv_scalar, i);

    SIMD_float pl1, pl2, pl3, pl4;

    int js = jv_scalar[0];
    pl1 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[1];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
                                                        js)), 1);
    js = jv_scalar[2];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
                                                        js)), 2);
    js = jv_scalar[3];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom +
                                                        js)), 3);

    js = jv_scalar[4];
    pl2 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[5];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
                                                        js)), 1);
    js = jv_scalar[6];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
                                                        js)), 2);
    js = jv_scalar[7];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom +
                                                        js)), 3);

    js = jv_scalar[8];
    pl3 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[9];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
                                                        js)), 1);
    js = jv_scalar[10];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
                                                        js)), 2);
    js = jv_scalar[11];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom +
                                                        js)), 3);

    js = jv_scalar[12];
    pl4 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[13];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
                                                        js)), 1);
    js = jv_scalar[14];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
                                                        js)), 2);
    js = jv_scalar[15];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom +
                                                        js)), 3);

    SIMD_int c0 = _mm512_setr_epi32(0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c,
                                    0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d);
    SIMD_int c1 = _mm512_setr_epi32(0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d,
                                    0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c);
    SIMD_int c2 = _mm512_setr_epi32(0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e,
                                    0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f);
    SIMD_int c3 = _mm512_setr_epi32(0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f,
                                    0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e);
    SIMD_mask k_1 = _mm512_int2mask(65280);

    SIMD_float sl1 = _mm512_permutex2var_ps(pl3, c0, pl4);
    SIMD_float sl2 = _mm512_permutex2var_ps(pl1, c1, pl2);
    SIMD_float sl3 = _mm512_permutex2var_ps(pl3, c2, pl4);
    SIMD_float sl4 = _mm512_permutex2var_ps(pl1, c3, pl2);

    x = _mm512_shuffle_f32x4(sl2, sl1, 78);
    z = _mm512_shuffle_f32x4(sl4, sl3, 78);
    y = _mm512_mask_blend_ps(k_1, sl2, sl1);
    type = _mm512_castps_si512(_mm512_mask_blend_ps(k_1, sl4, sl3));
  }
  #endif
}  // namespace ip_simd

#endif  // __AVX512F__

#endif  // INTEL_SIMD_H