/*
 * Copyright 2011-2013 Intel Corporation
 * Modifications Copyright 2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__

#ifndef __KERNEL_GPU__

#  include <limits>

#  include "util/util_defines.h"

/* SSE Intrinsics includes
 *
 * We assume the __KERNEL_SSEX__ flags have been defined at this point. */

/* SSE intrinsics headers */
#  ifndef FREE_WINDOWS64

#    ifdef _MSC_VER
#      include <intrin.h>
#    elif (defined(__x86_64__) || defined(__i386__))
#      include <x86intrin.h>
#    endif

#  else

/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, it is better to include only that. */
#    include "util/util_windows.h"

#  endif

#  if defined(__x86_64__) || defined(_M_X64)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  else
#    define SIMD_SET_FLUSH_TO_ZERO
#  endif
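
/* Usage sketch (illustrative, not part of this header): MXCSR flags are
 * per-thread state, so SIMD_SET_FLUSH_TO_ZERO is intended to be invoked once
 * near the start of each worker thread, before any SIMD float math runs.
 *
 *   static void worker_thread_run()   // hypothetical caller
 *   {
 *     SIMD_SET_FLUSH_TO_ZERO;
 *     // ... do rendering work ...
 *   }
 */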

CCL_NAMESPACE_BEGIN

#  ifdef __KERNEL_SSE2__

/* Table of per-lane masks indexed by a 4-bit selector, used by the SSE4.1
 * emulation below (_mm_blend_ps, _mm_insert_ps). */
extern const __m128 _mm_lookupmask_ps[16];

/* Special Types */

static struct TrueTy {
  __forceinline operator bool() const
  {
    return true;
  }
} True ccl_maybe_unused;

static struct FalseTy {
  __forceinline operator bool() const
  {
    return false;
  }
} False ccl_maybe_unused;

static struct ZeroTy {
  __forceinline operator float() const
  {
    return 0;
  }
  __forceinline operator int() const
  {
    return 0;
  }
} zero ccl_maybe_unused;

static struct OneTy {
  __forceinline operator float() const
  {
    return 1;
  }
  __forceinline operator int() const
  {
    return 1;
  }
} one ccl_maybe_unused;

static struct NegInfTy {
  __forceinline operator float() const
  {
    return -std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::min();
  }
} neg_inf ccl_maybe_unused;

static struct PosInfTy {
  __forceinline operator float() const
  {
    return std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::max();
  }
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;

static struct StepTy {
} step ccl_maybe_unused;
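
/* Example (sketch only): the sentinels above convert implicitly to the scalar
 * types they name, so callers can write named constants instead of literals.
 * The variable names below are hypothetical.
 *
 *   float best_t = pos_inf;     // +infinity via operator float()
 *   int lower_bound = neg_inf;  // INT_MIN via operator int()
 *   float origin = zero;        // 0.0f
 */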

/* Intrinsics Functions */

#    if defined(__BMI__) && defined(__GNUC__)
#      ifndef _tzcnt_u32
#        define _tzcnt_u32 __tzcnt_u32
#      endif
#      ifndef _tzcnt_u64
#        define _tzcnt_u64 __tzcnt_u64
#      endif
#    endif

#    if defined(__LZCNT__)
#      define _lzcnt_u32 __lzcnt32
#      define _lzcnt_u64 __lzcnt64
#    endif

#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)

__forceinline int __popcnt(int in)
{
  return _mm_popcnt_u32(in);
}

#      if !defined(_MSC_VER)
__forceinline unsigned int __popcnt(unsigned int in)
{
  return _mm_popcnt_u32(in);
}
#      endif

#      if defined(__KERNEL_64_BIT__)
__forceinline long long __popcnt(long long in)
{
  return _mm_popcnt_u64(in);
}
__forceinline size_t __popcnt(size_t in)
{
  return _mm_popcnt_u64(in);
}
#      endif

__forceinline int __bsf(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline unsigned int __bsf(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline int __bsr(int v)
{
  unsigned long r = 0;
  _BitScanReverse(&r, v);
  return r;
}

__forceinline int __btc(int v, int i)
{
  long r = v;
  _bittestandcomplement(&r, i);
  return r;
}

__forceinline int __bts(int v, int i)
{
  long r = v;
  _bittestandset(&r, i);
  return r;
}

__forceinline int __btr(int v, int i)
{
  long r = v;
  _bittestandreset(&r, i);
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

__forceinline int __bscf(int &v)
{
  int i = __bsf(v);
  v &= v - 1;
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = __bsf(v);
  v &= v - 1;
  return i;
}

#      if defined(__KERNEL_64_BIT__)

__forceinline size_t __bsf(size_t v)
{
#        if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#        else
  unsigned long r = 0;
  _BitScanForward64(&r, v);
  return r;
#        endif
}

__forceinline size_t __bsr(size_t v)
{
  unsigned long r = 0;
  _BitScanReverse64(&r, v);
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = v;
  _bittestandcomplement64((__int64 *)&r, i);
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandset64(&r, i);
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandreset64(&r, i);
  return r;
}

__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}

__forceinline size_t __bscf(size_t &v)
{
  size_t i = __bsf(v);
  v &= v - 1;
  return i;
}

#      endif /* __KERNEL_64_BIT__ */

#    else /* _WIN32 */

__forceinline unsigned int __popcnt(unsigned int in)
{
  int r = 0;
  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
  return r;
}

__forceinline int __bsf(int v)
{
  int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __bsr(int v)
{
  int r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __btc(int v, int i)
{
  int r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __bts(int v, int i)
{
  int r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __btr(int v, int i)
{
  int r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bsf(size_t v)
{
  size_t r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}
#      endif

__forceinline unsigned int __bsf(unsigned int v)
{
  unsigned int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __bsr(size_t v)
{
  size_t r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  size_t r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  size_t r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline unsigned int bitscan(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}
#      endif

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

__forceinline int __bscf(int &v)
{
  int i = bitscan(v);
#      if defined(__KERNEL_AVX2__)
  v &= v - 1;
#      else
  v = __btc(v, i);
#      endif
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = bitscan(v);
  v &= v - 1;
  return i;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bscf(size_t &v)
{
  size_t i = bitscan(v);
#        if defined(__KERNEL_AVX2__)
  v &= v - 1;
#        else
  v = __btc(v, i);
#        endif
  return i;
}
#      endif

#    endif /* _WIN32 */
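
/* Usage sketch (illustrative only): the bit-scan helpers above are typically
 * used to visit each set bit of a mask, with __bscf() returning the index of
 * the lowest set bit and clearing it in one step:
 *
 *   int mask = ...;  // e.g. a lane hit mask
 *   while (mask != 0) {
 *     const int bit = __bscf(mask);
 *     // ... process lane `bit` ...
 *   }
 */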

/* Test __KERNEL_SSE41__ for MSVC which does not define __SSE4_1__, and test
 * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
 * platforms when compiling code outside the kernel. */
#    if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))

/* Emulation of SSE4 functions with SSE2 */

#      define _MM_FROUND_TO_NEAREST_INT 0x00
#      define _MM_FROUND_TO_NEG_INF 0x01
#      define _MM_FROUND_TO_POS_INF 0x02
#      define _MM_FROUND_TO_ZERO 0x03
#      define _MM_FROUND_CUR_DIRECTION 0x04

#      undef _mm_blendv_ps
#      define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
  __m128i isignmask = _mm_set1_epi32(0x80000000);
  __m128 signmask = _mm_castsi128_ps(isignmask);
  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}
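
/* The emulation above reproduces the SSE4.1 blendv semantics: for each lane,
 * the sign bit of `mask` selects `input` (sign bit set) over `value` (sign bit
 * clear). A typical caller builds the mask from a comparison, e.g.
 *
 *   __m128 result = _mm_blendv_ps(a, b, _mm_cmplt_ps(a, b));  // per-lane max
 */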

#      undef _mm_blend_ps
#      define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
  assert(mask < 0x10);
  return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}

#      undef _mm_blendv_epi8
#      define _mm_blendv_epi8 _mm_blendv_epi8_emu
/* Note: unlike the hardware instruction (which tests the high bit of every
 * byte), this emulation expects each mask byte to be either all ones or all
 * zeros, as produced by the compare intrinsics. */
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
  return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}

#      undef _mm_min_epi32
#      define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}

#      undef _mm_max_epi32
#      define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}

#      undef _mm_extract_epi32
#      define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
  switch (index) {
    case 0:
      return _mm_cvtsi128_si32(input);
    case 1:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
    case 2:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
    case 3:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
    default:
      assert(false);
      return 0;
  }
}

#      undef _mm_insert_epi32
#      define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
  assert(index >= 0 && index < 4);
  ((int *)&value)[index] = input;
  return value;
}

#      undef _mm_insert_ps
#      define _mm_insert_ps _mm_insert_ps_emu
/* The immediate follows the SSE4.1 insertps layout: bits 7-6 select the source
 * element, bits 5-4 select the destination element, and bits 3-0 form a zero
 * mask applied to the result. */
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
  assert(index < 0x100);
  ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
  return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}

#      undef _mm_round_ps
#      define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
  switch (flags) {
    case _MM_FROUND_TO_NEAREST_INT:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
    case _MM_FROUND_TO_NEG_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
    case _MM_FROUND_TO_POS_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
    case _MM_FROUND_TO_ZERO:
      return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
  }
  return value;
}
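
/* Usage sketch (illustrative only): rounding a vector of floats with the
 * emulated _mm_round_ps. Note that the TO_NEG_INF/TO_POS_INF cases add +/-0.5
 * and round to nearest, so they can differ from a true floor()/ceil() for
 * values that are already exact integers.
 *
 *   __m128 v = _mm_set_ps(2.7f, 2.3f, -1.5f, 0.5f);
 *   __m128 truncated = _mm_round_ps(v, _MM_FROUND_TO_ZERO);
 *   __m128 nearest = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT);
 */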

#    endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */

/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
#    if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
#      undef _mm256_cvtss_f32
#      define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#    endif

#  else /* __KERNEL_SSE2__ */

/* This section is for utility functions that operate on non-register data and
 * that may be used from non-vectorized code. */

ccl_device_inline int bitscan(int value)
{
  assert(value != 0);
  int bit = 0;
  while ((value & (1 << bit)) == 0) {
    ++bit;
  }
  return bit;
}

ccl_device_inline int __bsr(int value)
{
  assert(value != 0);
  int bit = 0;
  while (value >>= 1) {
    ++bit;
  }
  return bit;
}
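
/* Worked example: for value = 12 (binary 1100), bitscan(12) returns 2 (index of
 * the lowest set bit) and __bsr(12) returns 3 (index of the highest set bit).
 * Both helpers assert on value == 0, mirroring the hardware instructions whose
 * result is undefined for a zero input. */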

#  endif /* __KERNEL_SSE2__ */

/* Quiet unused define warnings. */
#  if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
      defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#  endif

CCL_NAMESPACE_END

#endif /* __KERNEL_GPU__ */

#endif /* __UTIL_SIMD_TYPES_H__ */