// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
#if !defined(_tzcnt_u32)
#define _tzcnt_u32 __tzcnt_u32
#endif
#if !defined(_tzcnt_u64)
#define _tzcnt_u64 __tzcnt_u64
#endif
#endif

#if defined(__LZCNT__)
#if !defined(_lzcnt_u32)
#define _lzcnt_u32 __lzcnt32
#endif
#if !defined(_lzcnt_u64)
#define _lzcnt_u64 __lzcnt64
#endif
#endif

#if defined(__WIN32__)
#  define NOMINMAX
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif

namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }
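  /* Illustrative sketch (not part of the original header): bsf/bscf are the
     usual way to walk the set bits of a traversal mask. bscf() returns the
     index of the lowest set bit and clears that bit from the mask, so the
     loop below visits bits 1, 2 and 4 in that order. The process() callback
     is hypothetical. */
#if 0
  inline void example_iterate_mask(void (*process)(int))
  {
    int mask = 0x16; // binary 10110
    while (mask) {
      int i = bscf(mask); // index of lowest set bit; the bit is cleared from mask
      process(i);
    }
  }
#endif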
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }
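  /* Illustrative sketch (not part of the original header): atomic_cmpxchg()
     stores v into *p only if the current value equals the comparand c, and
     returns the previous value either way. Lock-free updates retry until the
     exchange succeeds, e.g. a hypothetical atomic max: */
#if 0
  inline void example_atomic_max(volatile int32_t* p, int32_t x)
  {
    int32_t old = *p;
    while (old < x) {
      const int32_t prev = atomic_cmpxchg(p, old, x); // store x if *p still equals old
      if (prev == old) break; // exchange succeeded
      old = prev;             // another thread updated *p; retry
    }
  }
#endif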
////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* Not supported yet, meaning measuring traversal cost per pixel does not work. */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }
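  /* Illustrative sketch (not part of the original header): blsr() resets the
     lowest set bit, i.e. blsr(m) == (m & (m-1)), which is the same step
     bscf() uses to advance through a mask. Repeating it until the mask is
     empty gives the classic Kernighan popcount: */
#if 0
  inline size_t example_popcount_via_blsr(size_t m)
  {
    size_t n = 0;
    while (m) { m = blsr(m); n++; } // each blsr() removes exactly one set bit
    return n; // e.g. example_popcount_via_blsr(0x14) == 2
  }
#endif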
  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i)); // complement bit i
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i)); // set bit i
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i)); // reset bit i
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ ((size_t)1 << i)); // complement bit i
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i)); // set bit i
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i)); // reset bit i
#endif
  }

  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#if defined(__X86_ASM__)
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0); // cpuid serializes execution around the timestamp read
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }
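  /* Illustrative sketch (not part of the original header): pause_cpu() is
     intended for spin-wait loops; _mm_pause() hints to the CPU that the loop
     is a busy-wait, which reduces power use and pipeline-flush penalties.
     A hypothetical consumer waiting on a flag might look like this: */
#if 0
  inline void example_spin_until(volatile int32_t* flag)
  {
    while (atomic_cmpxchg(flag, 1, 0) != 1) // try to consume the flag (1 -> 0)
      pause_cpu();                          // back off while waiting
  }
#endif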
  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }

#if defined(__AVX2__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}
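/* Illustrative sketch (not part of the original header): the
   _MM_SET_DENORMALS_ZERO_MODE fallback defined at the top of this file is
   typically paired with flush-to-zero mode before rendering starts, so that
   denormal floats do not slow down the SSE pipelines: */
#if 0
#include <xmmintrin.h> // _MM_SET_FLUSH_ZERO_MODE / _MM_FLUSH_ZERO_ON
inline void example_enable_ftz_daz()
{
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);         // results: denormals flushed to 0
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); // inputs:  denormals treated as 0
}
#endif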