// clang-format off
/* -*- c++ -*- -------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------
   Contributing author: W. Michael Brown (Intel)
------------------------------------------------------------------------- */

/* ----------------------------------------------------------------------

   Vector intrinsics are temporarily being used for the Stillinger-Weber
   potential to allow advanced features of the AVX512 instruction set to
   be exploited on early hardware. We hope to see compiler improvements
   for AVX512 that will eliminate this requirement, so it is not
   recommended to develop code based on the intrinsics implementation.
   Please e-mail the authors for more details.

------------------------------------------------------------------------- */

#ifndef INTEL_SIMD_H
#define INTEL_SIMD_H

#include "intel_preprocess.h"
#include "immintrin.h"

#ifdef __AVX512F__

#ifndef _MM_SCALE_1
#define _MM_SCALE_1 1
#define _MM_SCALE_2 2
#define _MM_SCALE_4 4
#define _MM_SCALE_8 8
#endif

namespace ip_simd {

  typedef __mmask16 SIMD_mask;

  struct SIMD_int {
    __m512i v;
    SIMD_int() {}
    SIMD_int(const __m512i in) : v(in) {}
    operator __m512i() const { return v; }
  };

  struct SIMD_float {
    __m512 v;
    SIMD_float() {}
    SIMD_float(const __m512 in) : v(in) {}
    operator __m512() const { return v; }
  };

  struct SIMD_double {
    __m512d v;
    SIMD_double() {}
    SIMD_double(const __m512d in) : v(in) {}
    operator __m512d() const { return v; }
  };

  template<class flt_t>
  class SIMD_type {
  };

  template<>
  class SIMD_type<float> {
   public:
    typedef SIMD_float SIMD_vec;
    static inline int width() { return 16; }
  };

  template<>
  class SIMD_type<double> {
   public:
    typedef SIMD_double SIMD_vec;
    static inline int width() { return 8; }
  };

  template<class flt_t, class acc_t>
  class is_same {
   public:
    static const int value = 1;
  };

  template<>
  class is_same<float,double> {
   public:
    static const int value = 0;
  };
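  // Illustrative only (not part of the original file): the wrapper structs
  // convert implicitly to and from the native __m512* types, and
  // SIMD_type<flt_t>::width() gives the lane count for a precision, so a
  // caller might process a 64-byte aligned, padded array x of length n as:
  //
  //   for (int i = 0; i < n; i += SIMD_type<float>::width()) {
  //     SIMD_float v = SIMD_load(x + i);   // wraps _mm512_load_ps (below)
  //     v = v * v;                         // operator overloads (below)
  //     SIMD_store(x + i, v);
  //   }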
  // ------- Set Operations

  inline SIMD_int SIMD_set(const int l0, const int l1, const int l2,
                           const int l3, const int l4, const int l5,
                           const int l6, const int l7, const int l8,
                           const int l9, const int l10, const int l11,
                           const int l12, const int l13, const int l14,
                           const int l15) {
    return _mm512_setr_epi32(l0,l1,l2,l3,l4,l5,l6,l7,
                             l8,l9,l10,l11,l12,l13,l14,l15);
  }

  inline SIMD_int SIMD_set(const int l) {
    return _mm512_set1_epi32(l);
  }

  inline SIMD_float SIMD_set(const float l) {
    return _mm512_set1_ps(l);
  }

  inline SIMD_double SIMD_set(const double l) {
    return _mm512_set1_pd(l);
  }

  inline SIMD_int SIMD_zero_masked(const SIMD_mask &m, const SIMD_int &one) {
    return _mm512_maskz_mov_epi32(m, one);
  }

  inline SIMD_float SIMD_zero_masked(const SIMD_mask &m,
                                     const SIMD_float &one) {
    return _mm512_maskz_mov_ps(m, one);
  }

  inline SIMD_double SIMD_zero_masked(const SIMD_mask &m,
                                      const SIMD_double &one) {
    return _mm512_maskz_mov_pd(m, one);
  }

  inline SIMD_float SIMD_set(const SIMD_float &src, const SIMD_mask &m,
                             const SIMD_float &one) {
    return _mm512_mask_mov_ps(src,m,one);
  }

  inline SIMD_double SIMD_set(const SIMD_double &src, const SIMD_mask &m,
                              const SIMD_double &one) {
    return _mm512_mask_mov_pd(src,m,one);
  }

  // -------- Load Operations

  inline SIMD_int SIMD_load(const int *p) {
    return _mm512_load_epi32(p);
  }

  inline SIMD_float SIMD_load(const float *p) {
    return _mm512_load_ps(p);
  }

  inline SIMD_double SIMD_load(const double *p) {
    return _mm512_load_pd(p);
  }

  inline SIMD_int SIMD_loadz(const SIMD_mask &m, const int *p) {
    return _mm512_maskz_load_epi32(m, p);
  }

  inline SIMD_float SIMD_loadz(const SIMD_mask &m, const float *p) {
    return _mm512_maskz_load_ps(m, p);
  }

  inline SIMD_double SIMD_loadz(const SIMD_mask &m, const double *p) {
    return _mm512_maskz_load_pd(m, p);
  }

  inline SIMD_int SIMD_gather(const int *p, const SIMD_int &i) {
    return _mm512_i32gather_epi32(i, p, _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gather(const float *p, const SIMD_int &i) {
    return _mm512_i32gather_ps(i, p, _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gather(const double *p, const SIMD_int &i) {
    return _mm512_i32gather_pd(_mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }

  inline SIMD_int SIMD_gather(const SIMD_mask &m, const int *p,
                              const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, p,
                                       _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gather(const SIMD_mask &m, const float *p,
                                const SIMD_int &i) {
    return _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, p,
                                    _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gather(const SIMD_mask &m, const double *p,
                                 const SIMD_int &i) {
    return _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), p, _MM_SCALE_8);
  }
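  // Note on the gathers above: the 32-bit indices are scaled by _MM_SCALE_4
  // for int/float data and _MM_SCALE_8 for double data, and the
  // double-precision forms use only the low 8 of the 16 int32 indices
  // (via _mm512_castsi512_si256) since a 512-bit register holds 8 doubles.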
  template <typename T>
  inline SIMD_int SIMD_gatherz_offset(const SIMD_mask &m, const int *p,
                                      const SIMD_int &i) {
  }

  template <>
  inline SIMD_int SIMD_gatherz_offset<float>(const SIMD_mask &m, const int *p,
                                             const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
                                        _MM_SCALE_4);
  }

  template <>
  inline SIMD_int SIMD_gatherz_offset<double>(const SIMD_mask &m, const int *p,
                                              const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
                                        _MM_SCALE_8);
  }

  inline SIMD_int SIMD_gatherz(const SIMD_mask &m, const int *p,
                               const SIMD_int &i) {
    return _mm512_mask_i32gather_epi32( _mm512_set1_epi32(0), m, i, p,
                                        _MM_SCALE_4);
  }

  inline SIMD_float SIMD_gatherz(const SIMD_mask &m, const float *p,
                                 const SIMD_int &i) {
    return _mm512_mask_i32gather_ps( _mm512_set1_ps((float)0), m, i, p,
                                     _MM_SCALE_4);
  }

  inline SIMD_double SIMD_gatherz(const SIMD_mask &m, const double *p,
                                  const SIMD_int &i) {
    return _mm512_mask_i32gather_pd( _mm512_set1_pd(0.0), m,
                                     _mm512_castsi512_si256(i), p,
                                     _MM_SCALE_8);
  }

  // ------- Store Operations

  inline void SIMD_store(int *p, const SIMD_int &one) {
    return _mm512_store_epi32(p,one);
  }

  inline void SIMD_store(float *p, const SIMD_float &one) {
    return _mm512_store_ps(p,one);
  }

  inline void SIMD_store(double *p, const SIMD_double &one) {
    return _mm512_store_pd(p,one);
  }

  inline void SIMD_scatter(const SIMD_mask &m, int *p,
                           const SIMD_int &i, const SIMD_int &vec) {
    _mm512_mask_i32scatter_epi32(p, m, i, vec, _MM_SCALE_4);
  }

  inline void SIMD_scatter(const SIMD_mask &m, float *p,
                           const SIMD_int &i, const SIMD_float &vec) {
    _mm512_mask_i32scatter_ps(p, m, i, vec, _MM_SCALE_4);
  }

  inline void SIMD_scatter(const SIMD_mask &m, double *p,
                           const SIMD_int &i, const SIMD_double &vec) {
    _mm512_mask_i32scatter_pd(p, m, _mm512_castsi512_si256(i), vec,
                              _MM_SCALE_8);
  }

  // ------- Arithmetic Operations

  inline SIMD_int operator+(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_add_epi32(one,two);
  }

  inline SIMD_float operator+(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_add_ps(one,two);
  }

  inline SIMD_double operator+(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_add_pd(one,two);
  }

  inline SIMD_int operator+(const SIMD_int &one, const int two) {
    return _mm512_add_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator+(const SIMD_float &one, const float two) {
    return _mm512_add_ps(one,SIMD_set(two));
  }
  inline SIMD_double operator+(const SIMD_double &one, const double two) {
    return _mm512_add_pd(one,SIMD_set(two));
  }

  inline SIMD_int SIMD_add(const SIMD_mask &m,
                           const SIMD_int &one, const int two) {
    return _mm512_mask_add_epi32(one,m,one,SIMD_set(two));
  }

  inline SIMD_float SIMD_add(const SIMD_mask &m,
                             const SIMD_float &one, const float two) {
    return _mm512_mask_add_ps(one,m,one,SIMD_set(two));
  }

  inline SIMD_double SIMD_add(const SIMD_mask &m,
                              const SIMD_double &one, const double two) {
    return _mm512_mask_add_pd(one,m,one,SIMD_set(two));
  }

  inline SIMD_int SIMD_add(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_add_epi32(s,m,one,two);
  }

  inline SIMD_float SIMD_add(const SIMD_float &s, const SIMD_mask &m,
                             const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mask_add_ps(s,m,one,two);
  }

  inline SIMD_double SIMD_add(const SIMD_double &s, const SIMD_mask &m,
                              const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mask_add_pd(s,m,one,two);
  }

  inline SIMD_int SIMD_sub(const SIMD_int &s, const SIMD_mask &m,
                           const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mask_sub_epi32(s,m,one,two);
  }

  inline SIMD_float SIMD_sub(const SIMD_float &s, const SIMD_mask &m,
                             const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mask_sub_ps(s,m,one,two);
  }

  inline SIMD_double SIMD_sub(const SIMD_double &s, const SIMD_mask &m,
                              const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mask_sub_pd(s,m,one,two);
  }

  inline SIMD_int operator-(const SIMD_int &one) {
    return _mm512_sub_epi32(SIMD_set((int)0),one);
  }

  inline SIMD_float operator-(const SIMD_float &one) {
    return _mm512_sub_ps(SIMD_set((float)0),one);
  }

  inline SIMD_double operator-(const SIMD_double &one) {
    return _mm512_sub_pd(SIMD_set((double)0),one);
  }

  inline SIMD_int operator-(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_sub_epi32(one,two);
  }

  inline SIMD_float operator-(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_sub_ps(one,two);
  }

  inline SIMD_double operator-(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_sub_pd(one,two);
  }

  inline SIMD_int operator-(const SIMD_int &one, const int two) {
    return _mm512_sub_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator-(const SIMD_float &one, const float two) {
    return _mm512_sub_ps(one,SIMD_set(two));
  }

  inline SIMD_double operator-(const SIMD_double &one, const double two) {
    return _mm512_sub_pd(one,SIMD_set(two));
  }
  inline SIMD_int operator*(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_mullo_epi32(one,two);
  }

  inline SIMD_float operator*(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_mul_ps(one,two);
  }

  inline SIMD_double operator*(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_mul_pd(one,two);
  }

  inline SIMD_int operator*(const SIMD_int &one, const int two) {
    return _mm512_mullo_epi32(one,SIMD_set(two));
  }

  inline SIMD_float operator*(const SIMD_float &one, const float two) {
    return _mm512_mul_ps(one,SIMD_set(two));
  }

  inline SIMD_double operator*(const SIMD_double &one, const double two) {
    return _mm512_mul_pd(one,SIMD_set(two));
  }

  inline SIMD_float operator/(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_div_ps(one,two);
  }

  inline SIMD_double operator/(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_div_pd(one,two);
  }

  inline SIMD_float SIMD_fma(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmadd_ps(one,two,three);
  }

  inline SIMD_double SIMD_fma(const SIMD_double &one, const SIMD_double &two,
                              const SIMD_double &three) {
    return _mm512_fmadd_pd(one,two,three);
  }

  inline SIMD_float SIMD_fms(const SIMD_float &one, const SIMD_float &two,
                             const SIMD_float &three) {
    return _mm512_fmsub_ps(one,two,three);
  }

  inline SIMD_double SIMD_fms(const SIMD_double &one, const SIMD_double &two,
                              const SIMD_double &three) {
    return _mm512_fmsub_pd(one,two,three);
  }

  // ------- SVML operations

  inline SIMD_float SIMD_rcp(const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_rcp28_ps(one);
    #else
    return _mm512_recip_ps(one);
    #endif
  }

  inline SIMD_double SIMD_rcp(const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_rcp28_pd(one);
    #else
    return _mm512_recip_pd(one);
    #endif
  }

  inline SIMD_float SIMD_rcpz(const SIMD_mask &m, const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_maskz_rcp28_ps(m, one);
    #else
    return _mm512_mask_recip_ps(_mm512_set1_ps(0), m, one);
    #endif
  }

  inline SIMD_double SIMD_rcpz(const SIMD_mask &m, const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_maskz_rcp28_pd(m, one);
    #else
    return _mm512_mask_recip_pd(_mm512_set1_pd(0), m, one);
    #endif
  }

  inline SIMD_float SIMD_sqrt(const SIMD_float &one) {
    return _mm512_sqrt_ps(one);
  }

  inline SIMD_double SIMD_sqrt(const SIMD_double &one) {
    return _mm512_sqrt_pd(one);
  }

  inline SIMD_float SIMD_invsqrt(const SIMD_float &one) {
    #ifdef __AVX512ER__
    return _mm512_rsqrt28_ps(one);
    #else
    return _mm512_invsqrt_ps(one);
    #endif
  }
  inline SIMD_double SIMD_invsqrt(const SIMD_double &one) {
    #ifdef __AVX512ER__
    return _mm512_rsqrt28_pd(one);
    #else
    return _mm512_invsqrt_pd(one);
    #endif
  }

  inline SIMD_float SIMD_pow(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_pow_ps(one, two);
  }

  inline SIMD_double SIMD_pow(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_pow_pd(one, two);
  }

  inline SIMD_float SIMD_exp(const SIMD_float &one) {
    return _mm512_exp_ps(one);
  }

  inline SIMD_double SIMD_exp(const SIMD_double &one) {
    return _mm512_exp_pd(one);
  }

  // ------- Comparison operations

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_int &one,
                           const SIMD_int &two) {
    return _mm512_mask_cmplt_epi32_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_float &one,
                           const SIMD_float &two) {
    return _mm512_mask_cmplt_ps_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const SIMD_double &one,
                           const SIMD_double &two) {
    return _mm512_mask_cmplt_pd_mask(m, one, two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const int one,
                           const SIMD_int &two) {
    return _mm512_mask_cmplt_epi32_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const float one,
                           const SIMD_float &two) {
    return _mm512_mask_cmplt_ps_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask SIMD_lt(SIMD_mask m, const double one,
                           const SIMD_double &two) {
    return _mm512_mask_cmplt_pd_mask(m, SIMD_set(one), two);
  }

  inline SIMD_mask operator<(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmplt_epi32_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(one,two);
  }

  inline SIMD_mask operator<(const SIMD_int &one, const int two) {
    return _mm512_cmplt_epi32_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const SIMD_float &one, const float two) {
    return _mm512_cmplt_ps_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const SIMD_double &one, const double two) {
    return _mm512_cmplt_pd_mask(one,SIMD_set(two));
  }

  inline SIMD_mask operator<(const int one, const SIMD_int &two) {
    return _mm512_cmplt_epi32_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<(const float one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<(const double one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(SIMD_set(one),two);
  }

  inline SIMD_mask operator<=(const int one, const SIMD_int &two) {
    return _mm512_cmple_epi32_mask(SIMD_set(one), two);
  }

  inline SIMD_mask operator<=(const float one, const SIMD_float &two) {
    return _mm512_cmple_ps_mask(SIMD_set(one), two);
  }
  inline SIMD_mask operator<=(const double one, const SIMD_double &two) {
    return _mm512_cmple_pd_mask(SIMD_set(one), two);
  }

  inline SIMD_mask operator>(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmpgt_epi32_mask(one,two);
  }

  inline SIMD_mask operator>(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmplt_ps_mask(two,one);
  }

  inline SIMD_mask operator>(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmplt_pd_mask(two,one);
  }

  inline SIMD_mask operator==(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_cmpeq_epi32_mask(one,two);
  }

  inline SIMD_mask operator==(const SIMD_float &one, const SIMD_float &two) {
    return _mm512_cmpeq_ps_mask(one,two);
  }

  inline SIMD_mask operator==(const SIMD_double &one, const SIMD_double &two) {
    return _mm512_cmpeq_pd_mask(one,two);
  }

  // ------- Typecast operations

  inline void SIMD_cast(const SIMD_int &one, SIMD_float &two) {
    two = _mm512_cvtepi32_ps(one);
  }

  inline void SIMD_cast(const SIMD_int &one, SIMD_double &two) {
    two = _mm512_cvtepi32lo_pd(one);
  }

  // ------- Reduction operations

  inline int SIMD_max(const SIMD_int &i) {
    return _mm512_reduce_max_epi32(i);
  }

  inline float SIMD_max(const SIMD_float &i) {
    return _mm512_reduce_max_ps(i);
  }

  inline double SIMD_max(const SIMD_double &i) {
    return _mm512_reduce_max_pd(i);
  }

  inline int SIMD_sum(const SIMD_int &i) {
    return _mm512_reduce_add_epi32(i);
  }

  inline float SIMD_sum(const SIMD_float &i) {
    return _mm512_reduce_add_ps(i);
  }

  inline double SIMD_sum(const SIMD_double &i) {
    return _mm512_reduce_add_pd(i);
  }
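  // The SIMD_conflict_pi_reduce* routines below use the AVX-512CD conflict
  // detection instructions (_mm512_maskz_conflict_epi32 / _mm512_lzcnt_epi32)
  // to combine vector lanes that carry the same (positive) index: values in
  // conflicting lanes are folded together so that a following masked
  // gather / add / scatter sequence produces the correct total even when
  // several lanes target the same memory element, because the highest
  // conflicting lane ends up holding the combined sum.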
  // i indices should be positive
  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_float &v1) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
                                      _mm512_lzcnt_epi32(cd));

      while(todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_float am_perm;
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // i indices should be positive
  inline void SIMD_conflict_pi_reduce1(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_double &v1) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
                                      _mm512_lzcnt_epi32(cd));
      lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));

      while(todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_double am_perm;
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // i indices should be positive
  inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_float &v1, SIMD_float &v2,
                                       SIMD_float &v3) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
                                      _mm512_lzcnt_epi32(cd));

      while(todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_float am_perm;
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_ps(v1, now_mask, v1, am_perm);
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v2);
        v2 = _mm512_mask_add_ps(v2, now_mask, v2, am_perm);
        am_perm = _mm512_mask_permutexvar_ps(_mm512_undefined_ps(),
                                             now_mask, lid, v3);
        v3 = _mm512_mask_add_ps(v3, now_mask, v3, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }

  // i indices should be positive
  inline void SIMD_conflict_pi_reduce3(const SIMD_mask &m, const SIMD_int &i,
                                       SIMD_double &v1, SIMD_double &v2,
                                       SIMD_double &v3) {
    SIMD_int jc = _mm512_mask_mov_epi32(_mm512_set1_epi32(-1), m, i);
    SIMD_int cd = _mm512_maskz_conflict_epi32(m, jc);
    SIMD_mask todo_mask = _mm512_test_epi32_mask(cd, _mm512_set1_epi32(-1));
    if (todo_mask) {
      SIMD_int lz = _mm512_lzcnt_epi32(cd);
      SIMD_int lid = _mm512_sub_epi32(_mm512_set1_epi32(31),
                                      _mm512_lzcnt_epi32(cd));
      lid = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(lid));

      while(todo_mask) {
        SIMD_int todo_bcast = _mm512_broadcastmw_epi32(todo_mask);
        SIMD_mask now_mask = _mm512_mask_testn_epi32_mask(todo_mask, cd,
                                                          todo_bcast);
        SIMD_double am_perm;
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v1);
        v1 = _mm512_mask_add_pd(v1, now_mask, v1, am_perm);
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v2);
        v2 = _mm512_mask_add_pd(v2, now_mask, v2, am_perm);
        am_perm = _mm512_mask_permutexvar_pd(_mm512_undefined_pd(),
                                             now_mask, lid, v3);
        v3 = _mm512_mask_add_pd(v3, now_mask, v3, am_perm);
        todo_mask = _mm512_kxor(todo_mask, now_mask);
      }
    }
  }
  // ------- Bit shift operations

  inline SIMD_int operator&(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_and_epi32(one,two);
  }

  inline SIMD_int operator>>(const SIMD_int &one, const SIMD_int &two) {
    return _mm512_srlv_epi32(one,two);
  }

  inline SIMD_int operator<<(const SIMD_int &one, const unsigned two) {
    return _mm512_slli_epi32(one,two);
  }

  // -------- I/O operations

  inline void SIMD_print(const __m512i &vec) {
    for (int i = 0; i < 16; i++)
      printf("%d ",(*((int*)&(vec) + (i))));
  }

  inline void SIMD_print(const __m512 &vec) {
    for (int i = 0; i < 16; i++)
      printf("%f ",(*((float*)&(vec) + (i))));
  }

  inline void SIMD_print(const __m512d &vec) {
    for (int i = 0; i < 8; i++)
      printf("%f ",(*((double*)&(vec) + (i))));
  }

  inline void SIMD_print(const SIMD_mask &mask) {
    SIMD_print(_mm512_maskz_mov_epi32(mask,SIMD_set(1)));
  }

  inline void SIMD_print(const char *id, const SIMD_mask &mask) {
    printf("%s ",id);
    SIMD_print(mask);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_int &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_float &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }

  inline void SIMD_print(const char *id, const SIMD_double &vec) {
    printf("%s ",id);
    SIMD_print(vec);
    printf("\n");
  }

  // ---------- LAMMPS operations
  #ifndef SW_GATHER_TEST
  inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z) {
    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
                                 _MM_SCALE_1);
    y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
                                 _MM_SCALE_1);
    z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
                                 _MM_SCALE_1);
  }

  inline void SIMD_atom_gather(const SIMD_mask &m, const float *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z, SIMD_int &type) {
    x = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom,
                                 _MM_SCALE_1);
    y = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+1,
                                 _MM_SCALE_1);
    z = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, atom+2,
                                 _MM_SCALE_1);
    type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
                                       _MM_SCALE_1);
  }
  #endif

  inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                               const SIMD_int &i, SIMD_double &x,
                               SIMD_double &y, SIMD_double &z) {
    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom,
                                 _MM_SCALE_2);
    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+1,
                                 _MM_SCALE_2);
    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+2,
                                 _MM_SCALE_2);
  }
  inline void SIMD_atom_gather(const SIMD_mask &m, const double *atom,
                               const SIMD_int &i, SIMD_double &x,
                               SIMD_double &y, SIMD_double &z, SIMD_int &type) {
    x = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom,
                                 _MM_SCALE_2);
    y = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+1,
                                 _MM_SCALE_2);
    z = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                 _mm512_castsi512_si256(i), atom+2,
                                 _MM_SCALE_2);
    type = _mm512_mask_i32gather_epi32(_mm512_undefined_epi32(), m, i, atom+3,
                                       _MM_SCALE_2);
  }

  inline SIMD_float SIMD_ev_add(const SIMD_float &one,
                                const SIMD_float &two) {
    return _mm512_add_ps(one,two);
  }

  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
                                 const SIMD_double &two) {
    return _mm512_add_pd(one,two);
  }

  inline SIMD_double SIMD_ev_add(const SIMD_double &one,
                                 const SIMD_float &two) {
    SIMD_double twod = _mm512_cvtps_pd(_mm512_castps512_ps256(two));
    SIMD_double ans = _mm512_add_pd(one,twod);
    twod = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(two,two,238)));
    return _mm512_add_pd(ans,twod);
  }

  inline void SIMD_jeng_update(const SIMD_mask &rmask, float *force,
                               const SIMD_int &joffset, SIMD_float &eng) {
    SIMD_float jeng;
    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
    jeng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), rmask, joffset,
                                    force, _MM_SCALE_1);
    jeng = jeng + eng;
    _mm512_mask_i32scatter_ps(force, rmask, joffset, jeng, _MM_SCALE_1);
  }

  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
                               const SIMD_int &joffset, SIMD_double &eng) {
    SIMD_double jeng;
    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force, _MM_SCALE_2);
    jeng = jeng + eng;
    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
                              jeng, _MM_SCALE_2);
  }

  inline void SIMD_jeng_update(const SIMD_mask &rmask, double *force,
                               const SIMD_int &joffset, SIMD_float &eng) {
    SIMD_double engd, jeng;
    engd = _mm512_cvtps_pd(_mm512_castps512_ps256(eng));
    SIMD_conflict_pi_reduce1(rmask, joffset, engd);
    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force, _MM_SCALE_2);
    jeng = jeng + engd;
    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
                              jeng, _MM_SCALE_2);

    SIMD_mask rmask2 = rmask >> 8;
    engd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(eng,eng,238)));
    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
    SIMD_conflict_pi_reduce1(rmask2, joffset2, engd);
    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
                                    _mm512_castsi512_si256(joffset2),
                                    force, _MM_SCALE_2);
    jeng = jeng + engd;
    _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
                              jeng, _MM_SCALE_2);
  }
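  // The mixed-precision (float vectors, double force/energy arrays) helpers
  // in this section all follow the same pattern: the low 8 float lanes are
  // widened to double with _mm512_cvtps_pd, the high 8 lanes are first moved
  // down with _mm512_shuffle_f32x4(v,v,238) (immediate 0xEE places the upper
  // 256 bits of v in the low half of the result), and the mask bits for the
  // high lanes are obtained with mask >> 8.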
  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, float *force,
                                  const SIMD_int &joffset1, SIMD_float &eng) {
  }

  inline void SIMD_jeng_update_hi(const SIMD_mask &mask, double *force,
                                  const SIMD_int &joffset1, SIMD_double &eng) {
    SIMD_mask rmask = mask >> 8;
    SIMD_int joffset = _mm512_shuffle_i32x4(joffset1, joffset1, 238);

    SIMD_double jeng;
    SIMD_conflict_pi_reduce1(rmask, joffset, eng);
    jeng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force, _MM_SCALE_2);
    jeng = jeng + eng;
    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
                              jeng, _MM_SCALE_2);
  }

  inline void SIMD_safe_jforce(const SIMD_mask &m, float *force,
                               const SIMD_int &i, SIMD_float &fx,
                               SIMD_float &fy, SIMD_float &fz) {
    SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
    SIMD_float jfrc;
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
                                    _MM_SCALE_1);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
                                    _MM_SCALE_1);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
                                    _MM_SCALE_1);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
  }

  inline void SIMD_safe_jforce(const SIMD_mask &m, double *force,
                               const SIMD_int &i, SIMD_double &fx,
                               SIMD_double &fy, SIMD_double &fz) {
    SIMD_conflict_pi_reduce3(m, i, fx, fy, fz);
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force,
                                    _MM_SCALE_2);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 1,
                                    _MM_SCALE_2);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 2,
                                    _MM_SCALE_2);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
  }
  inline void SIMD_safe_jforce(const SIMD_mask &rmask, double *force,
                               const SIMD_int &joffset, SIMD_float &amx,
                               SIMD_float &amy, SIMD_float &amz) {
    SIMD_double amxd, amyd, amzd;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
    SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force, _MM_SCALE_2);
    jfrc = jfrc + amxd;
    _mm512_mask_i32scatter_pd(force, rmask, _mm512_castsi512_si256(joffset),
                              jfrc, _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force + 1, _MM_SCALE_2);
    jfrc = jfrc + amyd;
    _mm512_mask_i32scatter_pd(force+1, rmask, _mm512_castsi512_si256(joffset),
                              jfrc, _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask,
                                    _mm512_castsi512_si256(joffset),
                                    force + 2, _MM_SCALE_2);
    jfrc = jfrc + amzd;
    _mm512_mask_i32scatter_pd(force+2, rmask, _mm512_castsi512_si256(joffset),
                              jfrc, _MM_SCALE_2);

    SIMD_mask rmask2 = rmask >> 8;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amx,amx,238)));
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amy,amy,238)));
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amz,amz,238)));
    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
    SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
                                    _mm512_castsi512_si256(joffset2),
                                    force, _MM_SCALE_2);
    jfrc = jfrc + amxd;
    _mm512_mask_i32scatter_pd(force, rmask2, _mm512_castsi512_si256(joffset2),
                              jfrc, _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
                                    _mm512_castsi512_si256(joffset2),
                                    force + 1, _MM_SCALE_2);
    jfrc = jfrc + amyd;
    _mm512_mask_i32scatter_pd(force+1, rmask2,
                              _mm512_castsi512_si256(joffset2), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), rmask2,
                                    _mm512_castsi512_si256(joffset2),
                                    force + 2, _MM_SCALE_2);
    jfrc = jfrc + amzd;
    _mm512_mask_i32scatter_pd(force+2, rmask2,
                              _mm512_castsi512_si256(joffset2), jfrc,
                              _MM_SCALE_2);
  }

  inline void SIMD_jforce_update(const SIMD_mask &m, float *force,
                                 const SIMD_int &i, const SIMD_float &fx,
                                 const SIMD_float &fy, const SIMD_float &fz) {
    SIMD_float jfrc;
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
                                    _MM_SCALE_1);
    jfrc = jfrc - fx;
    _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
                                    _MM_SCALE_1);
    jfrc = jfrc - fy;
    _mm512_mask_i32scatter_ps(force+1, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
                                    _MM_SCALE_1);
    jfrc = jfrc - fz;
    _mm512_mask_i32scatter_ps(force+2, m, i, jfrc, _MM_SCALE_1);
  }
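  // SIMD_scalar_update below is a scalar fallback that applies the 16 cached
  // per-lane force contributions one neighbor at a time; #pragma novector
  // keeps the compiler from vectorizing the indirect updates. The ft type is
  // expected to provide x, y and z members (e.g. an element of a LAMMPS
  // force array).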
  template <class ft>
  inline void SIMD_scalar_update(const int jj, const int* ejnum, ft *force,
                                 const int* i, const double *fx,
                                 const double *fy, const double *fz,
                                 const double *fx2, const double *fy2,
                                 const double *fz2) {
    #pragma novector
    for (int k = 0; k < 8; k++) {
      if (jj < ejnum[k]) {
        const int j = i[k];
        force[j].x -= fx[k];
        force[j].y -= fy[k];
        force[j].z -= fz[k];
      }
    }

    #pragma novector
    for (int k = 8; k < 16; k++) {
      if (jj < ejnum[k]) {
        const int j = i[k];
        force[j].x -= fx2[k-8];
        force[j].y -= fy2[k-8];
        force[j].z -= fz2[k-8];
      }
    }
  }

  inline void SIMD_jforce_update(const SIMD_mask &m, double *force,
                                 const SIMD_int &i, const SIMD_double &fx,
                                 const SIMD_double &fy, const SIMD_double &fz) {
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force,
                                    _MM_SCALE_2);
    jfrc = jfrc - fx;
    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 1,
                                    _MM_SCALE_2);
    jfrc = jfrc - fy;
    _mm512_mask_i32scatter_pd(force+1, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 2,
                                    _MM_SCALE_2);
    jfrc = jfrc - fz;
    _mm512_mask_i32scatter_pd(force+2, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
  }

  inline void SIMD_jforce_update(const SIMD_mask &rmask,
                                 double *force, const SIMD_int &joffset,
                                 SIMD_float &amx, SIMD_float &amy,
                                 SIMD_float &amz) {
    SIMD_double amxd, amyd, amzd;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
    SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
    SIMD_jforce_update(rmask, force, joffset, amxd, amyd, amzd);

    SIMD_mask rmask2 = rmask >> 8;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amx,amx,238)));
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amy,amy,238)));
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(amz,amz,238)));
    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
    SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
    SIMD_jforce_update(rmask2, force, joffset2, amxd, amyd, amzd);
  }

  inline void SIMD_cache3(float *pr, const int offset,
                          const SIMD_float &fx,
                          const SIMD_float &fy, const SIMD_float &fz) {
    float *p = pr;
    SIMD_float t;
    t = SIMD_load(p);
    t = t + fx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fy;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fz;
    SIMD_store(p, t);
  }
  inline void SIMD_cache3(double *pr, const int offset,
                          const SIMD_double &fx,
                          const SIMD_double &fy, const SIMD_double &fz) {
    double *p = pr;
    SIMD_double t;
    t = SIMD_load(p);
    t = t + fx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fy;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fz;
    SIMD_store(p, t);
  }

  inline void SIMD_cache3(double *pr, const int foffset,
                          const SIMD_float &fx,
                          const SIMD_float &fy, const SIMD_float &fz) {
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t, fd;

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fx));
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fx,fx,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fy));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fy,fy,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);

    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(fz));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
    fd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fz,fz,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + fd;
    SIMD_store(p,t);
  }

  inline void SIMD_cache3(float *pr, const int offset,
                          const SIMD_float &fx, const SIMD_float &fy,
                          const SIMD_float &fz, const SIMD_float &fx2,
                          const SIMD_float &fy2, const SIMD_float &fz2) {
  }

  inline void SIMD_cache3(double *pr, const int foffset,
                          const SIMD_double &fx, const SIMD_double &fy,
                          const SIMD_double &fz, const SIMD_double &fx2,
                          const SIMD_double &fy2, const SIMD_double &fz2) {
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t;

    t = SIMD_load(p);
    t = t + fx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fx2;
    SIMD_store(p,t);

    p = p + offset;
    t = SIMD_load(p);
    t = t + fy;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fy2;
    SIMD_store(p,t);

    p = p + offset;
    t = SIMD_load(p);
    t = t + fz;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fz2;
    SIMD_store(p,t);
  }
  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
                               const SIMD_float &fjy, const SIMD_float &fjz,
                               SIMD_float &fxtmp, SIMD_float &fytmp,
                               SIMD_float &fztmp, SIMD_float &fjxtmp,
                               SIMD_float &fjytmp, SIMD_float &fjztmp,
                               SIMD_float &fxtmp2, SIMD_float &fytmp2,
                               SIMD_float &fztmp2, SIMD_float &fjxtmp2,
                               SIMD_float &fjytmp2, SIMD_float &fjztmp2) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
  }

  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_double &fjx,
                               const SIMD_double &fjy, const SIMD_double &fjz,
                               SIMD_double &fxtmp, SIMD_double &fytmp,
                               SIMD_double &fztmp, SIMD_double &fjxtmp,
                               SIMD_double &fjytmp, SIMD_double &fjztmp,
                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
  }

  inline void SIMD_accumulate3(const SIMD_mask &kmask, const SIMD_float &fjx,
                               const SIMD_float &fjy, const SIMD_float &fjz,
                               SIMD_double &fxtmp, SIMD_double &fytmp,
                               SIMD_double &fztmp, SIMD_double &fjxtmp,
                               SIMD_double &fjytmp, SIMD_double &fjztmp,
                               SIMD_double &fxtmp2, SIMD_double &fytmp2,
                               SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                               SIMD_double &fjytmp2, SIMD_double &fjztmp2) {
    SIMD_mask kmask2 = kmask >> 8;
    SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx));
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjx,fjx,238)));
    fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd);
    fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjy));
    fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjy,fjy,238)));
    fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd);
    fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjz));
    fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjz,fjz,238)));
    fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd);
    fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
  }
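  // SIMD_acc_cache3 below extends SIMD_accumulate3: for the masked lanes it
  // subtracts (fj - fk) from the i-atom accumulators and fj from the j-atom
  // accumulators, and additionally adds the k-atom contribution into the
  // scratch array at pr, with the x, y and z planes separated by offset
  // elements.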
  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
                              const SIMD_float &fjy, const SIMD_float &fjz,
                              const SIMD_float &fkx, const SIMD_float &fky,
                              const SIMD_float &fkz,
                              SIMD_float &fxtmp, SIMD_float &fytmp,
                              SIMD_float &fztmp, SIMD_float &fjxtmp,
                              SIMD_float &fjytmp, SIMD_float &fjztmp,
                              SIMD_float &fxtmp2, SIMD_float &fytmp2,
                              SIMD_float &fztmp2, SIMD_float &fjxtmp2,
                              SIMD_float &fjytmp2, SIMD_float &fjztmp2,
                              float *pr, const int offset) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz - fkz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
    float *p = pr;
    SIMD_float t;
    t = SIMD_load(p);
    t = t + fkx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fky;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fkz;
    SIMD_store(p, t);
  }

  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_double &fjx,
                              const SIMD_double &fjy, const SIMD_double &fjz,
                              const SIMD_double &fkx, const SIMD_double &fky,
                              const SIMD_double &fkz,
                              SIMD_double &fxtmp, SIMD_double &fytmp,
                              SIMD_double &fztmp, SIMD_double &fjxtmp,
                              SIMD_double &fjytmp, SIMD_double &fjztmp,
                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
                              double *pr, const int offset) {
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, fjx - fkx);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, fjx);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, fjy - fky);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, fjy);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, fjz - fkz);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, fjz);
    double *p = pr;
    SIMD_double t;
    t = SIMD_load(p);
    t = t + fkx;
    SIMD_store(p,t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fky;
    SIMD_store(p, t);
    p = p + offset;
    t = SIMD_load(p);
    t = t + fkz;
    SIMD_store(p, t);
  }
  inline void SIMD_acc_cache3(const SIMD_mask &kmask, const SIMD_float &fjx,
                              const SIMD_float &fjy, const SIMD_float &fjz,
                              const SIMD_float &fkx, const SIMD_float &fky,
                              const SIMD_float &fkz,
                              SIMD_double &fxtmp, SIMD_double &fytmp,
                              SIMD_double &fztmp, SIMD_double &fjxtmp,
                              SIMD_double &fjytmp, SIMD_double &fjztmp,
                              SIMD_double &fxtmp2, SIMD_double &fytmp2,
                              SIMD_double &fztmp2, SIMD_double &fjxtmp2,
                              SIMD_double &fjytmp2, SIMD_double &fjztmp2,
                              double *pr, const int foffset) {
    SIMD_mask kmask2 = kmask >> 8;
    const int offset = foffset >> 1;
    double *p = pr;
    SIMD_double t;

    SIMD_double delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjx));
    SIMD_double delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fkx));
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fxtmp = SIMD_sub(fxtmp, kmask, fxtmp, delfd - delfdk);
    fjxtmp = SIMD_sub(fjxtmp, kmask, fjxtmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjx,fjx,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fkx,fkx,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fxtmp2 = SIMD_sub(fxtmp2, kmask2, fxtmp2, delfd - delfdk);
    fjxtmp2 = SIMD_sub(fjxtmp2, kmask2, fjxtmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjy));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fky));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fytmp = SIMD_sub(fytmp, kmask, fytmp, delfd - delfdk);
    fjytmp = SIMD_sub(fjytmp, kmask, fjytmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjy,fjy,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fky,fky,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fytmp2 = SIMD_sub(fytmp2, kmask2, fytmp2, delfd - delfdk);
    fjytmp2 = SIMD_sub(fjytmp2, kmask2, fjytmp2, delfd);

    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(fjz));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(fkz));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fztmp = SIMD_sub(fztmp, kmask, fztmp, delfd - delfdk);
    fjztmp = SIMD_sub(fjztmp, kmask, fjztmp, delfd);
    delfd = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fjz,fjz,238)));
    delfdk = _mm512_cvtps_pd(_mm512_castps512_ps256(
      _mm512_shuffle_f32x4(fkz,fkz,238)));
    p = p + offset;
    t = SIMD_load(p);
    t = t + delfdk;
    SIMD_store(p,t);
    fztmp2 = SIMD_sub(fztmp2, kmask2, fztmp2, delfd - delfdk);
    fjztmp2 = SIMD_sub(fjztmp2, kmask2, fjztmp2, delfd);
  }
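  // SIMD_acc_energy3 adds the pair energy of the active lanes to the running
  // total sevdwl and, when per-atom energy is requested (eatom), adds half of
  // it to the accumulators of both atoms involved.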
  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
                               const SIMD_double &evdwl, const int eatom,
                               SIMD_double &sevdwl, SIMD_double &fwtmp,
                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
                               SIMD_double &fjtmp2) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwl);
    if (eatom) {
      const SIMD_double hevdwl = evdwl * (double)0.5;
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
    }
  }

  inline void SIMD_acc_energy3(const SIMD_mask &hmask,
                               const SIMD_float &evdwl, const int eatom,
                               SIMD_double &sevdwl, SIMD_double &fwtmp,
                               SIMD_double &fjtmp, SIMD_double &fwtmp2,
                               SIMD_double &fjtmp2) {
    SIMD_double evdwld;
    evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(evdwl));
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, evdwld);
    if (eatom) {
      const SIMD_double hevdwl = evdwld * (double)0.5;
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
    }
    SIMD_mask hmask2 = hmask >> 8;
    evdwld = _mm512_cvtps_pd(_mm512_castps512_ps256(
        _mm512_shuffle_f32x4(evdwl, evdwl, 238)));
    sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, evdwld);
    if (eatom) {
      const SIMD_double hevdwl = evdwld * (double)0.5;
      fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
      fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
    }
  }

  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
                             const int eatom, SIMD_float &sevdwl,
                             SIMD_float &fwtmp, SIMD_float &fjtmp,
                             SIMD_float &fwtmp2, SIMD_float &fjtmp2,
                             const SIMD_int &k, float *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_float hevdwl = facrad * SIMD_set((float)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_float keng = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), hmask,
                                                 k, force + 3, _MM_SCALE_1);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_ps(force + 3, hmask, k, keng, _MM_SCALE_1);
    }
  }

  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_double &facrad,
                             const int eatom, SIMD_double &sevdwl,
                             SIMD_double &fwtmp, SIMD_double &fjtmp,
                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
                             const SIMD_int &k, double *force) {
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facrad);
    if (eatom) {
      SIMD_double hevdwl = facrad * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
                                                  _mm512_castsi512_si256(k),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
                                keng, _MM_SCALE_2);
    }
  }

  inline void SIMD_acc_three(const SIMD_mask &hmask, const SIMD_float &facrad,
                             const int eatom, SIMD_double &sevdwl,
                             SIMD_double &fwtmp, SIMD_double &fjtmp,
                             SIMD_double &fwtmp2, SIMD_double &fjtmp2,
                             const SIMD_int &k, double *force) {
    SIMD_double facradd;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(facrad));
    sevdwl = SIMD_add(sevdwl, hmask, sevdwl, facradd);
    if (eatom) {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp = SIMD_add(fwtmp, hmask, fwtmp, hevdwl);
      fjtmp = SIMD_add(fjtmp, hmask, fjtmp, hevdwl);
      SIMD_conflict_pi_reduce1(hmask, k, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), hmask,
                                                  _mm512_castsi512_si256(k),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask, _mm512_castsi512_si256(k),
                                keng, _MM_SCALE_2);
    }
    SIMD_mask hmask2 = hmask >> 8;
    facradd = _mm512_cvtps_pd(_mm512_castps512_ps256(
        _mm512_shuffle_f32x4(facrad, facrad, 238)));
    sevdwl = SIMD_add(sevdwl, hmask2, sevdwl, facradd);
    if (eatom) {
      SIMD_double hevdwl = facradd * SIMD_set((double)0.33333333);
      fwtmp2 = SIMD_add(fwtmp2, hmask2, fwtmp2, hevdwl);
      fjtmp2 = SIMD_add(fjtmp2, hmask2, fjtmp2, hevdwl);
      SIMD_int k2 = _mm512_shuffle_i32x4(k, k, 238);
      SIMD_conflict_pi_reduce1(hmask2, k2, hevdwl);
      SIMD_double keng = _mm512_mask_i32gather_pd(_mm512_undefined_pd(),
                                                  hmask2,
                                                  _mm512_castsi512_si256(k2),
                                                  force + 3, _MM_SCALE_2);
      keng = keng + hevdwl;
      _mm512_mask_i32scatter_pd(force + 3, hmask2, _mm512_castsi512_si256(k2),
                                keng, _MM_SCALE_2);
    }
  }

  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
      const float ev_pre,
      const SIMD_float &fpair, const SIMD_float &delx,
      const SIMD_float &dely, const SIMD_float &delz,
      SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
      SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
    if (vflag == 1) {
      const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
      sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
      sv1 = SIMD_add(sv1, m, sv1, dely * dely * prefpair);
      sv2 = SIMD_add(sv2, m, sv2, delz * delz * prefpair);
      sv3 = SIMD_add(sv3, m, sv3, delx * dely * prefpair);
      sv4 = SIMD_add(sv4, m, sv4, delx * delz * prefpair);
      sv5 = SIMD_add(sv5, m, sv5, dely * delz * prefpair);
    }
  }

  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
      const double ev_pre,
      const SIMD_double &fpair, const SIMD_double &delx,
      const SIMD_double &dely, const SIMD_double &delz,
      SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
      SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_double prefpair = SIMD_set(ev_pre) * fpair;
      sv0 = SIMD_add(sv0, m, sv0, delx * delx * prefpair);
      sv1 = SIMD_add(sv1, m, sv1, dely * dely * prefpair);
      sv2 = SIMD_add(sv2, m, sv2, delz * delz * prefpair);
      sv3 = SIMD_add(sv3, m, sv3, delx * dely * prefpair);
      sv4 = SIMD_add(sv4, m, sv4, delx * delz * prefpair);
      sv5 = SIMD_add(sv5, m, sv5, dely * delz * prefpair);
    }
  }

  inline void SIMD_ev_tally_nbor(const SIMD_mask &m, const int vflag,
      const float ev_pre,
      const SIMD_float &fpair, const SIMD_float &delx,
      const SIMD_float &dely, const SIMD_float &delz,
      SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
      SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_mask m2 = m >> 8;
      const SIMD_float prefpair = SIMD_set(ev_pre) * fpair;
      SIMD_float dpair = delx * delx * prefpair;
      SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv0 = SIMD_add(sv0, m, sv0, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv0 = SIMD_add(sv0, m2, sv0, dpaird);

      dpair = dely * dely * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv1 = SIMD_add(sv1, m, sv1, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv1 = SIMD_add(sv1, m2, sv1, dpaird);

      dpair = delz * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv2 = SIMD_add(sv2, m, sv2, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv2 = SIMD_add(sv2, m2, sv2, dpaird);

      dpair = delx * dely * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv3 = SIMD_add(sv3, m, sv3, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv3 = SIMD_add(sv3, m2, sv3, dpaird);

      dpair = delx * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv4 = SIMD_add(sv4, m, sv4, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv4 = SIMD_add(sv4, m2, sv4, dpaird);

      dpair = dely * delz * prefpair;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv5 = SIMD_add(sv5, m, sv5, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv5 = SIMD_add(sv5, m2, sv5, dpaird);
    }
  }
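  // SIMD_ev_tally_nbor accumulates the six independent components of the
  // pairwise virial (xx, yy, zz, xy, xz, yz) into sv0..sv5 for the masked
  // lanes when vflag == 1.  Per lane this corresponds to the scalar tally
  // sketched below; the helper and its names are hypothetical and shown
  // only to document what the vector code computes:
  //
  //   inline void ev_tally_nbor_scalar(const double ev_pre, const double fpair,
  //                                    const double delx, const double dely,
  //                                    const double delz, double sv[6]) {
  //     const double pre = ev_pre * fpair;
  //     sv[0] += delx * delx * pre;
  //     sv[1] += dely * dely * pre;
  //     sv[2] += delz * delz * pre;
  //     sv[3] += delx * dely * pre;
  //     sv[4] += delx * delz * pre;
  //     sv[5] += dely * delz * pre;
  //   }
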
  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
      const SIMD_float &fj0, const SIMD_float &fj1, const SIMD_float &fj2,
      const SIMD_float &fk0, const SIMD_float &fk1, const SIMD_float &fk2,
      const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz,
      const SIMD_float &delr2x, const SIMD_float &delr2y,
      const SIMD_float &delr2z,
      SIMD_float &sv0, SIMD_float &sv1, SIMD_float &sv2,
      SIMD_float &sv3, SIMD_float &sv4, SIMD_float &sv5) {
    if (vflag == 1) {
      sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
      sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
      sv2 = SIMD_add(sv2, m, sv2, delz * fj2 + delr2z * fk2);
      sv3 = SIMD_add(sv3, m, sv3, delx * fj1 + delr2x * fk1);
      sv4 = SIMD_add(sv4, m, sv4, delx * fj2 + delr2x * fk2);
      sv5 = SIMD_add(sv5, m, sv5, dely * fj2 + delr2y * fk2);
    }
  }

  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
      const SIMD_double &fj0, const SIMD_double &fj1, const SIMD_double &fj2,
      const SIMD_double &fk0, const SIMD_double &fk1, const SIMD_double &fk2,
      const SIMD_double &delx, const SIMD_double &dely,
      const SIMD_double &delz, const SIMD_double &delr2x,
      const SIMD_double &delr2y, const SIMD_double &delr2z,
      SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
      SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      sv0 = SIMD_add(sv0, m, sv0, delx * fj0 + delr2x * fk0);
      sv1 = SIMD_add(sv1, m, sv1, dely * fj1 + delr2y * fk1);
      sv2 = SIMD_add(sv2, m, sv2, delz * fj2 + delr2z * fk2);
      sv3 = SIMD_add(sv3, m, sv3, delx * fj1 + delr2x * fk1);
      sv4 = SIMD_add(sv4, m, sv4, delx * fj2 + delr2x * fk2);
      sv5 = SIMD_add(sv5, m, sv5, dely * fj2 + delr2y * fk2);
    }
  }

  inline void SIMD_ev_tally_nbor3v(const SIMD_mask &m, const int vflag,
      const SIMD_float &fj0, const SIMD_float &fj1, const SIMD_float &fj2,
      const SIMD_float &fk0, const SIMD_float &fk1, const SIMD_float &fk2,
      const SIMD_float &delx, const SIMD_float &dely, const SIMD_float &delz,
      const SIMD_float &delr2x, const SIMD_float &delr2y,
      const SIMD_float &delr2z,
      SIMD_double &sv0, SIMD_double &sv1, SIMD_double &sv2,
      SIMD_double &sv3, SIMD_double &sv4, SIMD_double &sv5) {
    if (vflag == 1) {
      const SIMD_mask m2 = m >> 8;
      SIMD_float dpair = delx * fj0 + delr2x * fk0;
      SIMD_double dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv0 = SIMD_add(sv0, m, sv0, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv0 = SIMD_add(sv0, m2, sv0, dpaird);

      dpair = dely * fj1 + delr2y * fk1;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv1 = SIMD_add(sv1, m, sv1, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv1 = SIMD_add(sv1, m2, sv1, dpaird);

      dpair = delz * fj2 + delr2z * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv2 = SIMD_add(sv2, m, sv2, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv2 = SIMD_add(sv2, m2, sv2, dpaird);

      dpair = delx * fj1 + delr2x * fk1;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv3 = SIMD_add(sv3, m, sv3, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv3 = SIMD_add(sv3, m2, sv3, dpaird);

      dpair = delx * fj2 + delr2x * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv4 = SIMD_add(sv4, m, sv4, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv4 = SIMD_add(sv4, m2, sv4, dpaird);

      dpair = dely * fj2 + delr2y * fk2;
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(dpair));
      sv5 = SIMD_add(sv5, m, sv5, dpaird);
      dpaird = _mm512_cvtps_pd(_mm512_castps512_ps256(
          _mm512_shuffle_f32x4(dpair, dpair, 238)));
      sv5 = SIMD_add(sv5, m2, sv5, dpaird);
    }
  }

  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
      float *force, const SIMD_int &joffset, SIMD_float &amx,
      SIMD_float &amy, SIMD_float &amz, SIMD_float &fxtmp,
      SIMD_float &fytmp, SIMD_float &fztmp, SIMD_float &fxtmp2,
      SIMD_float &fytmp2, SIMD_float &fztmp2) {
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx);
    fytmp = SIMD_add(fytmp, rmask, fytmp, amy);
    fztmp = SIMD_add(fztmp, rmask, fztmp, amz);
    SIMD_conflict_pi_reduce3(rmask, joffset, amx, amy, amz);
    SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
  }

  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
      double *force, const SIMD_int &joffset, SIMD_double &amx,
      SIMD_double &amy, SIMD_double &amz, SIMD_double &fxtmp,
      SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
      SIMD_double &fytmp2, SIMD_double &fztmp2) {
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amx);
    fytmp = SIMD_add(fytmp, rmask, fytmp, amy);
    fztmp = SIMD_add(fztmp, rmask, fztmp, amz);
    SIMD_conflict_pi_reduce3(rmask, joffset, amx, amy, amz);
    SIMD_jforce_update(rmask, force, joffset, amx, amy, amz);
  }
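  // SIMD_safe_force_accumulate adds a pairwise force to the running i-atom
  // accumulators and then updates the neighbor (j) forces in memory.  Several
  // lanes may hold the same j index, so SIMD_conflict_pi_reduce3 first
  // combines duplicate contributions; otherwise the masked gather/add/scatter
  // in SIMD_jforce_update could drop updates.  A minimal caller sketch, with
  // all variable names assumed to be state set up earlier in a pair loop
  // (they are illustrative, not defined by this header):
  //
  //   // SIMD_float amx, amy, amz;  // force on j from the current pair
  //   // SIMD_int joffset;          // offsets of the j atoms into force[]
  //   // SIMD_mask rmask;           // lanes holding a valid pair
  //   SIMD_safe_force_accumulate(rmask, force, joffset, amx, amy, amz,
  //                              fxtmp, fytmp, fztmp,
  //                              fxtmp2, fytmp2, fztmp2);
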
  inline void SIMD_safe_force_accumulate(const SIMD_mask &rmask,
      double *force, const SIMD_int &joffset, SIMD_float &amx,
      SIMD_float &amy, SIMD_float &amz, SIMD_double &fxtmp,
      SIMD_double &fytmp, SIMD_double &fztmp, SIMD_double &fxtmp2,
      SIMD_double &fytmp2, SIMD_double &fztmp2) {
    SIMD_double amxd, amyd, amzd;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(amx));
    fxtmp = SIMD_add(fxtmp, rmask, fxtmp, amxd);
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(amy));
    fytmp = SIMD_add(fytmp, rmask, fytmp, amyd);
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(amz));
    fztmp = SIMD_add(fztmp, rmask, fztmp, amzd);
    SIMD_conflict_pi_reduce3(rmask, joffset, amxd, amyd, amzd);
    SIMD_jforce_update(rmask, force, joffset, amxd, amyd, amzd);

    SIMD_mask rmask2 = rmask >> 8;
    amxd = _mm512_cvtps_pd(_mm512_castps512_ps256(
        _mm512_shuffle_f32x4(amx, amx, 238)));
    fxtmp2 = SIMD_add(fxtmp2, rmask2, fxtmp2, amxd);
    amyd = _mm512_cvtps_pd(_mm512_castps512_ps256(
        _mm512_shuffle_f32x4(amy, amy, 238)));
    fytmp2 = SIMD_add(fytmp2, rmask2, fytmp2, amyd);
    amzd = _mm512_cvtps_pd(_mm512_castps512_ps256(
        _mm512_shuffle_f32x4(amz, amz, 238)));
    fztmp2 = SIMD_add(fztmp2, rmask2, fztmp2, amzd);
    SIMD_int joffset2 = _mm512_shuffle_i32x4(joffset, joffset, 238);
    SIMD_conflict_pi_reduce3(rmask2, joffset2, amxd, amyd, amzd);
    SIMD_jforce_update(rmask2, force, joffset2, amxd, amyd, amzd);
  }

  inline void SIMD_iforce_update(const SIMD_mask &m, float *force,
                                 const SIMD_int &i, const SIMD_float &fx,
                                 const SIMD_float &fy, const SIMD_float &fz,
                                 const int EFLAG, const int eatom,
                                 const SIMD_float &fwtmp) {
    SIMD_float jfrc;
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force,
                                    _MM_SCALE_1);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_ps(force, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 1,
                                    _MM_SCALE_1);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_ps(force + 1, m, i, jfrc, _MM_SCALE_1);
    jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 2,
                                    _MM_SCALE_1);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_ps(force + 2, m, i, jfrc, _MM_SCALE_1);
    if (EFLAG) {
      if (eatom) {
        jfrc = _mm512_mask_i32gather_ps(_mm512_undefined_ps(), m, i, force + 3,
                                        _MM_SCALE_1);
        jfrc = jfrc + fwtmp;
        _mm512_mask_i32scatter_ps(force + 3, m, i, jfrc, _MM_SCALE_1);
      }
    }
  }

  inline void SIMD_iforce_update(const SIMD_mask &m, double *force,
                                 const SIMD_int &i, const SIMD_double &fx,
                                 const SIMD_double &fy, const SIMD_double &fz,
                                 const int EFLAG, const int eatom,
                                 const SIMD_double &fwtmp) {
    SIMD_double jfrc;
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force,
                                    _MM_SCALE_2);
    jfrc = jfrc + fx;
    _mm512_mask_i32scatter_pd(force, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 1,
                                    _MM_SCALE_2);
    jfrc = jfrc + fy;
    _mm512_mask_i32scatter_pd(force + 1, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                    _mm512_castsi512_si256(i), force + 2,
                                    _MM_SCALE_2);
    jfrc = jfrc + fz;
    _mm512_mask_i32scatter_pd(force + 2, m, _mm512_castsi512_si256(i), jfrc,
                              _MM_SCALE_2);
    if (EFLAG) {
      if (eatom) {
        jfrc = _mm512_mask_i32gather_pd(_mm512_undefined_pd(), m,
                                        _mm512_castsi512_si256(i),
                                        force + 3, _MM_SCALE_2);
        jfrc = jfrc + fwtmp;
        _mm512_mask_i32scatter_pd(force + 3, m, _mm512_castsi512_si256(i),
                                  jfrc, _MM_SCALE_2);
      }
    }
  }

#ifdef SW_GATHER_TEST
  // Experimental packed-atom gather: each neighbor record (x, y, z, type in
  // 16 consecutive bytes) is fetched with a 128-bit load, four records are
  // packed per 512-bit register, and the four registers are transposed with
  // permutex2var/shuffle into separate x, y, z, and type vectors.
  template <class atom_t>
  inline void SIMD_atom_gather(const SIMD_mask &m, const atom_t *atom,
                               const SIMD_int &i, SIMD_float &x, SIMD_float &y,
                               SIMD_float &z, SIMD_int &type) {
    int jv_scalar[16] __attribute__((aligned(64)));
    int jm_scalar[16] __attribute__((aligned(64)));
    _mm512_store_epi32(jv_scalar, i);

    SIMD_float pl1, pl2, pl3, pl4;

    int js = jv_scalar[0];
    pl1 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[1];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + js)), 1);
    js = jv_scalar[2];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + js)), 2);
    js = jv_scalar[3];
    pl1 = _mm512_insertf32x4(pl1, _mm_load_ps((float *)((char *)atom + js)), 3);

    js = jv_scalar[4];
    pl2 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[5];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + js)), 1);
    js = jv_scalar[6];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + js)), 2);
    js = jv_scalar[7];
    pl2 = _mm512_insertf32x4(pl2, _mm_load_ps((float *)((char *)atom + js)), 3);

    js = jv_scalar[8];
    pl3 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[9];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + js)), 1);
    js = jv_scalar[10];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + js)), 2);
    js = jv_scalar[11];
    pl3 = _mm512_insertf32x4(pl3, _mm_load_ps((float *)((char *)atom + js)), 3);

    js = jv_scalar[12];
    pl4 = _mm512_loadu_ps((float *)((char *)atom + js));
    js = jv_scalar[13];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + js)), 1);
    js = jv_scalar[14];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + js)), 2);
    js = jv_scalar[15];
    pl4 = _mm512_insertf32x4(pl4, _mm_load_ps((float *)((char *)atom + js)), 3);

    SIMD_int c0 = _mm512_setr_epi32(0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c,
                                    0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d);
    SIMD_int c1 = _mm512_setr_epi32(0x1,0x5,0x9,0xd,0x11,0x15,0x19,0x1d,
                                    0x0,0x4,0x8,0xc,0x10,0x14,0x18,0x1c);
    SIMD_int c2 = _mm512_setr_epi32(0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e,
                                    0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f);
    SIMD_int c3 = _mm512_setr_epi32(0x3,0x7,0xb,0xf,0x13,0x17,0x1b,0x1f,
                                    0x2,0x6,0xa,0xe,0x12,0x16,0x1a,0x1e);
    SIMD_mask k_1 = _mm512_int2mask(65280);

    SIMD_float sl1 = _mm512_permutex2var_ps(pl3, c0, pl4);
    SIMD_float sl2 = _mm512_permutex2var_ps(pl1, c1, pl2);
    SIMD_float sl3 = _mm512_permutex2var_ps(pl3, c2, pl4);
    SIMD_float sl4 = _mm512_permutex2var_ps(pl1, c3, pl2);

    x = _mm512_shuffle_f32x4(sl2, sl1, 78);
    z = _mm512_shuffle_f32x4(sl4, sl3, 78);
    y = _mm512_mask_blend_ps(k_1, sl2, sl1);
    type = _mm512_castps_si512(_mm512_mask_blend_ps(k_1, sl4, sl3));
  }
#endif
}

#endif

#endif