// Copyright 2009-2021 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "platform.h"

#if defined(__WIN32__)
#include <intrin.h>
#endif

#if defined(__ARM_NEON)
#include "../simd/arm/emulation.h"
#else
#include <immintrin.h>
#endif

#if defined(__BMI__) && defined(__GNUC__) && !defined(__INTEL_COMPILER)
  #if !defined(_tzcnt_u32)
    #define _tzcnt_u32 __tzcnt_u32
  #endif
  #if !defined(_tzcnt_u64)
    #define _tzcnt_u64 __tzcnt_u64
  #endif
#endif

#if defined(__LZCNT__)
  #if !defined(_lzcnt_u32)
    #define _lzcnt_u32 __lzcnt32
  #endif
  #if !defined(_lzcnt_u64)
    #define _lzcnt_u64 __lzcnt64
  #endif
#endif

#if defined(__WIN32__)
#  define NOMINMAX
#  include <windows.h>
#endif

/* normally defined in pmmintrin.h, but we always need this */
#if !defined(_MM_SET_DENORMALS_ZERO_MODE)
#define _MM_DENORMALS_ZERO_ON   (0x0040)
#define _MM_DENORMALS_ZERO_OFF  (0x0000)
#define _MM_DENORMALS_ZERO_MASK (0x0040)
#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x)))
#endif
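/* example usage (typically during application or thread setup):
   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); */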

namespace embree
{

////////////////////////////////////////////////////////////////////////////////
/// Windows Platform
////////////////////////////////////////////////////////////////////////////////

#if defined(__WIN32__)

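  /* reads the Windows high-resolution performance counter, which is used here as
     the timestamp source instead of the raw TSC */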
  __forceinline size_t read_tsc()
  {
    LARGE_INTEGER li;
    QueryPerformanceCounter(&li);
    return (size_t)li.QuadPart;
  }

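  /* bsf: returns the index of the lowest set bit (result is undefined for v == 0
     on the _BitScanForward path) */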
  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

  __forceinline unsigned bsf(unsigned v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanForward(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
    return _tzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanForward64(&r,v); return r;
#endif
  }
#endif

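  /* bscf: returns the index of the lowest set bit and clears that bit in v */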
  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline unsigned bscf(unsigned& v)
  {
    unsigned i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__X86_64__)
  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

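  /* bsr: returns the index of the highest set bit (result is undefined for v == 0
     on the _BitScanReverse path) */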
  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#else
    unsigned long r = 0; _BitScanReverse(&r,v); return r;
#endif
  }

#if defined(__X86_64__)
  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
    return 63 - _lzcnt_u64(v);
#else
    unsigned long r = 0; _BitScanReverse64(&r, v); return r;
#endif
  }
#endif

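  /* lzcnt: counts the number of leading zero bits (returns 32 for x == 0) */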
  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

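  /* btc/bts/btr: return v with bit i toggled, set, or cleared, respectively */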
  __forceinline int btc(int v, int i) {
    long r = v; _bittestandcomplement(&r,i); return r;
  }

  __forceinline int bts(int v, int i) {
    long r = v; _bittestandset(&r,i); return r;
  }

  __forceinline int btr(int v, int i) {
    long r = v; _bittestandreset(&r,i); return r;
  }

#if defined(__X86_64__)

  __forceinline size_t btc(size_t v, size_t i) {
    size_t r = v; _bittestandcomplement64((__int64*)&r,i); return r;
  }

  __forceinline size_t bts(size_t v, size_t i) {
    __int64 r = v; _bittestandset64(&r,i); return r;
  }

  __forceinline size_t btr(size_t v, size_t i) {
    __int64 r = v; _bittestandreset64(&r,i); return r;
  }

#endif

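  /* atomically replaces *p with v if *p equals c, and returns the previous value of *p */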
  __forceinline int32_t atomic_cmpxchg(volatile int32_t* p, const int32_t c, const int32_t v) {
    return _InterlockedCompareExchange((volatile long*)p,v,c);
  }

////////////////////////////////////////////////////////////////////////////////
/// Unix Platform
////////////////////////////////////////////////////////////////////////////////

#else

#if defined(__i386__) && defined(__PIC__)

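  /* on 32-bit PIC builds ebx holds the GOT pointer, so cpuid has to save and
     restore ebx via xchg instead of clobbering it directly */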
  __forceinline void __cpuid(int out[4], int op)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3])
                  : "0"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2)
  {
    asm volatile ("xchg{l}\t{%%}ebx, %1\n\t"
                  "cpuid\n\t"
                  "xchg{l}\t{%%}ebx, %1\n\t"
                  : "=a" (out[0]), "=r" (out[1]), "=c" (out[2]), "=d" (out[3])
                  : "0" (op1), "2" (op2));
  }

#elif defined(__X86_ASM__)

  __forceinline void __cpuid(int out[4], int op) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op));
  }

  __forceinline void __cpuid_count(int out[4], int op1, int op2) {
    asm volatile ("cpuid" : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) : "a"(op1), "c"(op2));
  }

#endif

  __forceinline uint64_t read_tsc() {
#if defined(__X86_ASM__)
    uint32_t high,low;
    asm volatile ("rdtsc" : "=d"(high), "=a"(low));
    return (((uint64_t)high) << 32) + (uint64_t)low;
#else
    /* not supported yet; this means measuring traversal cost per pixel does not work */
    return 0;
#endif
  }

  __forceinline int bsf(int v) {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsf(unsigned v)
  {
#if defined(__AVX2__)
    return _tzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctz(v);
#endif
  }
#endif

  __forceinline size_t bsf(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return _tzcnt_u64(v);
#else
    return _tzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsf %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_ctzl(v);
#endif
  }

  __forceinline int bscf(int& v)
  {
    int i = bsf(v);
    v &= v-1;
    return i;
  }

#if defined(__64BIT__)
  __forceinline unsigned int bscf(unsigned int& v)
  {
    unsigned int i = bsf(v);
    v &= v-1;
    return i;
  }
#endif

  __forceinline size_t bscf(size_t& v)
  {
    size_t i = bsf(v);
    v &= v-1;
    return i;
  }

  __forceinline int bsr(int v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    int r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }

#if defined(__64BIT__)
  __forceinline unsigned bsr(unsigned v) {
#if defined(__AVX2__)
    return 31 - _lzcnt_u32(v);
#elif defined(__X86_ASM__)
    unsigned r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return __builtin_clz(v) ^ 31;
#endif
  }
#endif

  __forceinline size_t bsr(size_t v) {
#if defined(__AVX2__)
#if defined(__X86_64__)
    return 63 - _lzcnt_u64(v);
#else
    return 31 - _lzcnt_u32(v);
#endif
#elif defined(__X86_ASM__)
    size_t r = 0; asm ("bsr %1,%0" : "=r"(r) : "r"(v)); return r;
#else
    return (sizeof(v) * 8 - 1) - __builtin_clzl(v);
#endif
  }

  __forceinline int lzcnt(const int x)
  {
#if defined(__AVX2__)
    return _lzcnt_u32(x);
#else
    if (unlikely(x == 0)) return 32;
    return 31 - bsr(x);
#endif
  }

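  /* blsr: clears the lowest set bit of v */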
  __forceinline size_t blsr(size_t v) {
#if defined(__AVX2__)
#if defined(__INTEL_COMPILER)
    return _blsr_u64(v);
#else
#if defined(__X86_64__)
    return __blsr_u64(v);
#else
    return __blsr_u32(v);
#endif
#endif
#else
    return v & (v-1);
#endif
  }

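  /* btc/bts/btr: return v with bit i toggled, set, or cleared; the non-assembly
     fallbacks compute the same result with plain bit operations */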
  __forceinline int btc(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ (1 << i));
#endif
  }

  __forceinline int bts(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | (1 << i)); /* set bit i (previously shifted v instead of 1, which did not match the asm path) */
#endif
  }

  __forceinline int btr(int v, int i) {
#if defined(__X86_ASM__)
    int r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~(1 << i)); /* clear bit i (previously shifted v instead of 1, which did not match the asm path) */
#endif
  }

  __forceinline size_t btc(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags" ); return r;
#else
    return (v ^ ((size_t)1 << i)); /* shift a size_t so bit indices >= 32 are valid */
#endif
  }

  __forceinline size_t bts(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v | ((size_t)1 << i)); /* set bit i */
#endif
  }

  __forceinline size_t btr(size_t v, size_t i) {
#if defined(__X86_ASM__)
    size_t r = 0; asm ("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags"); return r;
#else
    return (v & ~((size_t)1 << i)); /* clear bit i */
#endif
  }

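  /* atomically replaces *value with input if *value equals comparand, and returns
     the previous value */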
  __forceinline int32_t atomic_cmpxchg(int32_t volatile* value, int32_t comparand, const int32_t input) {
    return __sync_val_compare_and_swap(value, comparand, input);
  }

#endif

////////////////////////////////////////////////////////////////////////////////
/// All Platforms
////////////////////////////////////////////////////////////////////////////////

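/* fallbacks for the _mm*_undefined_* intrinsics, which are missing from older
   GCC/clang headers; zero initialization is a safe substitute */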
#if defined(__clang__) || defined(__GNUC__)
#if !defined(_mm_undefined_ps)
  __forceinline __m128 _mm_undefined_ps() { return _mm_setzero_ps(); }
#endif
#if !defined(_mm_undefined_si128)
  __forceinline __m128i _mm_undefined_si128() { return _mm_setzero_si128(); }
#endif
#if !defined(_mm256_undefined_ps) && defined(__AVX__)
  __forceinline __m256 _mm256_undefined_ps() { return _mm256_setzero_ps(); }
#endif
#if !defined(_mm256_undefined_si256) && defined(__AVX__)
  __forceinline __m256i _mm256_undefined_si256() { return _mm256_setzero_si256(); }
#endif
#if !defined(_mm512_undefined_ps) && defined(__AVX512F__)
  __forceinline __m512 _mm512_undefined_ps() { return _mm512_setzero_ps(); }
#endif
#if !defined(_mm512_undefined_epi32) && defined(__AVX512F__)
  __forceinline __m512i _mm512_undefined_epi32() { return _mm512_setzero_si512(); }
#endif
#endif

#if defined(__SSE4_2__)

  __forceinline int popcnt(int in) {
    return _mm_popcnt_u32(in);
  }

  __forceinline unsigned popcnt(unsigned in) {
    return _mm_popcnt_u32(in);
  }

#if defined(__64BIT__)
  __forceinline size_t popcnt(size_t in) {
    return _mm_popcnt_u64(in);
  }
#endif

#endif

#if defined(__X86_ASM__)
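  /* rdtsc: reads the timestamp counter, using cpuid before and after as a
     serializing instruction to prevent reordering around the measurement */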
  __forceinline uint64_t rdtsc()
  {
    int dummy[4];
    __cpuid(dummy,0);
    uint64_t clock = read_tsc();
    __cpuid(dummy,0);
    return clock;
  }
#endif

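  /* executes N pause instructions as a spin-wait hint to the processor */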
  __forceinline void pause_cpu(const size_t N = 8)
  {
    for (size_t i=0; i<N; i++)
      _mm_pause();
  }

  /* prefetches */
  __forceinline void prefetchL1 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T0); }
  __forceinline void prefetchL2 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T1); }
  __forceinline void prefetchL3 (const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_T2); }
  __forceinline void prefetchNTA(const void* ptr) { _mm_prefetch((const char*)ptr,_MM_HINT_NTA); }
  __forceinline void prefetchEX (const void* ptr) {
#if defined(__INTEL_COMPILER)
    _mm_prefetch((const char*)ptr,_MM_HINT_ET0);
#else
    _mm_prefetch((const char*)ptr,_MM_HINT_T0);
#endif
  }

  __forceinline void prefetchL1EX(const void* ptr) {
    prefetchEX(ptr);
  }

  __forceinline void prefetchL2EX(const void* ptr) {
    prefetchEX(ptr);
  }
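
  /* pext/pdep: BMI2 parallel bit extract/deposit of the bits of a according to mask b */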
#if defined(__AVX2__)
  __forceinline unsigned int pext(unsigned int a, unsigned int b) { return _pext_u32(a, b); }
  __forceinline unsigned int pdep(unsigned int a, unsigned int b) { return _pdep_u32(a, b); }
#if defined(__X86_64__)
  __forceinline size_t pext(size_t a, size_t b) { return _pext_u64(a, b); }
  __forceinline size_t pdep(size_t a, size_t b) { return _pdep_u64(a, b); }
#endif
#endif

#if defined(__AVX512F__)
#if defined(__INTEL_COMPILER)
  __forceinline float mm512_cvtss_f32(__m512 v) {
    return _mm512_cvtss_f32(v);
  }
  __forceinline int mm512_mask2int(__mmask16 k1) {
    return _mm512_mask2int(k1);
  }
  __forceinline __mmask16 mm512_int2mask(int mask) {
    return _mm512_int2mask(mask);
  }
#else
  __forceinline float mm512_cvtss_f32(__m512 v) { // FIXME: _mm512_cvtss_f32 neither supported by clang v4.0.0 nor GCC 6.3
    return _mm_cvtss_f32(_mm512_castps512_ps128(v));
  }
  __forceinline int mm512_mask2int(__mmask16 k1) { // FIXME: _mm512_mask2int not yet supported by GCC 6.3
    return (int)k1;
  }
  __forceinline __mmask16 mm512_int2mask(int mask) { // FIXME: _mm512_int2mask not yet supported by GCC 6.3
    return (__mmask16)mask;
  }
#endif
#endif
}