/*
 * Copyright 2011-2013 Intel Corporation
 * Modifications Copyright 2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __UTIL_SIMD_TYPES_H__
#define __UTIL_SIMD_TYPES_H__

#ifndef __KERNEL_GPU__

#  include <limits>

#  include "util/util_defines.h"

/* SSE Intrinsics includes
 *
 * We assume the __KERNEL_SSEX__ flags have been defined at this point. */

/* SSE intrinsics headers */
#  ifndef FREE_WINDOWS64

#    ifdef _MSC_VER
#      include <intrin.h>
#    elif (defined(__x86_64__) || defined(__i386__))
#      include <x86intrin.h>
#    endif

#  else

/* MinGW64 has conflicting declarations for these SSE headers in <windows.h>.
 * Since we can't avoid including <windows.h>, it is better to only include that. */
#    include "util/util_windows.h"

#  endif

#  if defined(__x86_64__) || defined(_M_X64)
#    define SIMD_SET_FLUSH_TO_ZERO \
      _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); \
      _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
#  else
#    define SIMD_SET_FLUSH_TO_ZERO
#  endif
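
/* SIMD_SET_FLUSH_TO_ZERO enables flush-to-zero and denormals-are-zero in the
 * MXCSR control register of the calling thread, which avoids the heavy
 * performance penalty of denormal arithmetic on many x86 CPUs. Illustrative
 * use only (hypothetical caller, not part of this header): invoke it once per
 * worker thread before SIMD-heavy work.
 *
 *   static void thread_render_entry()
 *   {
 *     SIMD_SET_FLUSH_TO_ZERO
 *     // ... kernel evaluation ...
 *   }
 */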

CCL_NAMESPACE_BEGIN

#  ifdef __KERNEL_SSE2__

extern const __m128 _mm_lookupmask_ps[16];

/* Special Types */

static struct TrueTy {
  __forceinline operator bool() const
  {
    return true;
  }
} True ccl_maybe_unused;

static struct FalseTy {
  __forceinline operator bool() const
  {
    return false;
  }
} False ccl_maybe_unused;

static struct ZeroTy {
  __forceinline operator float() const
  {
    return 0;
  }
  __forceinline operator int() const
  {
    return 0;
  }
} zero ccl_maybe_unused;

static struct OneTy {
  __forceinline operator float() const
  {
    return 1;
  }
  __forceinline operator int() const
  {
    return 1;
  }
} one ccl_maybe_unused;

static struct NegInfTy {
  __forceinline operator float() const
  {
    return -std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::min();
  }
} neg_inf ccl_maybe_unused;

static struct PosInfTy {
  __forceinline operator float() const
  {
    return std::numeric_limits<float>::infinity();
  }
  __forceinline operator int() const
  {
    return std::numeric_limits<int>::max();
  }
} inf ccl_maybe_unused, pos_inf ccl_maybe_unused;

static struct StepTy {
} step ccl_maybe_unused;
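
/* These tag objects follow the Embree convention of named constants: SIMD
 * vector types can provide constructors or operators that take `zero`, `one`,
 * `neg_inf`, `pos_inf` or `step` instead of literal values. Illustrative
 * sketch only, assuming a vector type with such tag constructors (the names
 * below are hypothetical and not defined in this header):
 *
 *   ssef lower(neg_inf);  // all lanes set to -infinity
 *   ssef upper(pos_inf);  // all lanes set to +infinity
 *   ssef t(zero);         // all lanes set to 0.0f
 */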

/* Intrinsics Functions */

#    if defined(__BMI__) && defined(__GNUC__)
#      ifndef _tzcnt_u32
#        define _tzcnt_u32 __tzcnt_u32
#      endif
#      ifndef _tzcnt_u64
#        define _tzcnt_u64 __tzcnt_u64
#      endif
#    endif

#    if defined(__LZCNT__)
#      define _lzcnt_u32 __lzcnt32
#      define _lzcnt_u64 __lzcnt64
#    endif

#    if defined(_WIN32) && !defined(__MINGW32__) && !defined(__clang__)

__forceinline int __popcnt(int in)
{
  return _mm_popcnt_u32(in);
}

#      if !defined(_MSC_VER)
__forceinline unsigned int __popcnt(unsigned int in)
{
  return _mm_popcnt_u32(in);
}
#      endif

#      if defined(__KERNEL_64_BIT__)
__forceinline long long __popcnt(long long in)
{
  return _mm_popcnt_u64(in);
}
__forceinline size_t __popcnt(size_t in)
{
  return _mm_popcnt_u64(in);
}
#      endif

__forceinline int __bsf(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline unsigned int __bsf(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  unsigned long r = 0;
  _BitScanForward(&r, v);
  return r;
#      endif
}

__forceinline int __bsr(int v)
{
  unsigned long r = 0;
  _BitScanReverse(&r, v);
  return r;
}

__forceinline int __btc(int v, int i)
{
  long r = v;
  _bittestandcomplement(&r, i);
  return r;
}

__forceinline int __bts(int v, int i)
{
  long r = v;
  _bittestandset(&r, i);
  return r;
}

__forceinline int __btr(int v, int i)
{
  long r = v;
  _bittestandreset(&r, i);
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

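/* __bscf ("bit scan and clear first") returns the index of the lowest set bit
 * of `v` and clears that bit in place; `v &= v - 1` is the standard trick that
 * clears only the lowest set bit. For example, with v = 0b01100 the call
 * returns 2 and leaves v = 0b01000. A typical (illustrative) caller iterates
 * over every set bit of a mask:
 *
 *   while (mask != 0) {
 *     const int index = __bscf(mask);
 *     // ... process index ...
 *   }
 */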
__forceinline int __bscf(int &v)
{
  int i = __bsf(v);
  v &= v - 1;
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = __bsf(v);
  v &= v - 1;
  return i;
}

#      if defined(__KERNEL_64_BIT__)

__forceinline size_t __bsf(size_t v)
{
#        if defined(__KERNEL_AVX2__)
  return _tzcnt_u64(v);
#        else
  unsigned long r = 0;
  _BitScanForward64(&r, v);
  return r;
#        endif
}

__forceinline size_t __bsr(size_t v)
{
  unsigned long r = 0;
  _BitScanReverse64(&r, v);
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = v;
  _bittestandcomplement64((__int64 *)&r, i);
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandset64(&r, i);
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  __int64 r = v;
  _bittestandreset64(&r, i);
  return r;
}

__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}

__forceinline size_t __bscf(size_t &v)
{
  size_t i = __bsf(v);
  v &= v - 1;
  return i;
}

#      endif /* __KERNEL_64_BIT__ */

#    else /* _WIN32 */

__forceinline unsigned int __popcnt(unsigned int in)
{
  int r = 0;
  asm("popcnt %1,%0" : "=r"(r) : "r"(in));
  return r;
}

__forceinline int __bsf(int v)
{
  int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __bsr(int v)
{
  int r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline int __btc(int v, int i)
{
  int r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __bts(int v, int i)
{
  int r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int __btr(int v, int i)
{
  int r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bsf(size_t v)
{
  size_t r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}
#      endif

__forceinline unsigned int __bsf(unsigned int v)
{
  unsigned int r = 0;
  asm("bsf %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __bsr(size_t v)
{
  size_t r = 0;
  asm("bsr %1,%0" : "=r"(r) : "r"(v));
  return r;
}

__forceinline size_t __btc(size_t v, size_t i)
{
  size_t r = 0;
  asm("btc %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __bts(size_t v, size_t i)
{
  size_t r = 0;
  asm("bts %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline size_t __btr(size_t v, size_t i)
{
  size_t r = 0;
  asm("btr %1,%0" : "=r"(r) : "r"(i), "0"(v) : "flags");
  return r;
}

__forceinline int bitscan(int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

__forceinline unsigned int bitscan(unsigned int v)
{
#      if defined(__KERNEL_AVX2__)
  return _tzcnt_u32(v);
#      else
  return __bsf(v);
#      endif
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t bitscan(size_t v)
{
#        if defined(__KERNEL_AVX2__)
#          if defined(__KERNEL_64_BIT__)
  return _tzcnt_u64(v);
#          else
  return _tzcnt_u32(v);
#          endif
#        else
  return __bsf(v);
#        endif
}
#      endif

__forceinline int clz(const int x)
{
#      if defined(__KERNEL_AVX2__)
  return _lzcnt_u32(x);
#      else
  if (UNLIKELY(x == 0))
    return 32;
  return 31 - __bsr(x);
#      endif
}

__forceinline int __bscf(int &v)
{
  int i = bitscan(v);
#      if defined(__KERNEL_AVX2__)
  v &= v - 1;
#      else
  v = __btc(v, i);
#      endif
  return i;
}

__forceinline unsigned int __bscf(unsigned int &v)
{
  unsigned int i = bitscan(v);
  v &= v - 1;
  return i;
}

#      if (defined(__KERNEL_64_BIT__) || defined(__APPLE__)) && \
          !(defined(__ILP32__) && defined(__x86_64__))
__forceinline size_t __bscf(size_t &v)
{
  size_t i = bitscan(v);
#        if defined(__KERNEL_AVX2__)
  v &= v - 1;
#        else
  v = __btc(v, i);
#        endif
  return i;
}
#      endif

#    endif /* _WIN32 */

/* Test __KERNEL_SSE41__ for MSVC, which does not define __SSE4_1__, and test
 * __SSE4_1__ to avoid OpenImageIO conflicts with our emulation macros on other
 * platforms when compiling code outside the kernel. */
#    if !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__))

/* Emulation of SSE4 functions with SSE2 */

#      define _MM_FROUND_TO_NEAREST_INT 0x00
#      define _MM_FROUND_TO_NEG_INF 0x01
#      define _MM_FROUND_TO_POS_INF 0x02
#      define _MM_FROUND_TO_ZERO 0x03
#      define _MM_FROUND_CUR_DIRECTION 0x04

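/* Note on the fallback: SSE4.1 _mm_blendv_ps selects between `value` and
 * `input` per lane based only on the sign bit of `mask`. The SSE2 emulation
 * below reproduces that by isolating the sign bit of each mask lane, expanding
 * it to a full 32-bit lane mask with a compare, and then selecting with
 * and/andnot/or. */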
#      undef _mm_blendv_ps
#      define _mm_blendv_ps _mm_blendv_ps_emu
__forceinline __m128 _mm_blendv_ps_emu(__m128 value, __m128 input, __m128 mask)
{
  __m128i isignmask = _mm_set1_epi32(0x80000000);
  __m128 signmask = _mm_castsi128_ps(isignmask);
  __m128i iandsign = _mm_castps_si128(_mm_and_ps(mask, signmask));
  __m128i icmpmask = _mm_cmpeq_epi32(iandsign, isignmask);
  __m128 cmpmask = _mm_castsi128_ps(icmpmask);
  return _mm_or_ps(_mm_and_ps(cmpmask, input), _mm_andnot_ps(cmpmask, value));
}

#      undef _mm_blend_ps
#      define _mm_blend_ps _mm_blend_ps_emu
__forceinline __m128 _mm_blend_ps_emu(__m128 value, __m128 input, const int mask)
{
  assert(mask < 0x10);
  return _mm_blendv_ps(value, input, _mm_lookupmask_ps[mask]);
}

#      undef _mm_blendv_epi8
#      define _mm_blendv_epi8 _mm_blendv_epi8_emu
__forceinline __m128i _mm_blendv_epi8_emu(__m128i value, __m128i input, __m128i mask)
{
  return _mm_or_si128(_mm_and_si128(mask, input), _mm_andnot_si128(mask, value));
}

#      undef _mm_min_epi32
#      define _mm_min_epi32 _mm_min_epi32_emu
__forceinline __m128i _mm_min_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(input, value, _mm_cmplt_epi32(value, input));
}

#      undef _mm_max_epi32
#      define _mm_max_epi32 _mm_max_epi32_emu
__forceinline __m128i _mm_max_epi32_emu(__m128i value, __m128i input)
{
  return _mm_blendv_epi8(value, input, _mm_cmplt_epi32(value, input));
}

#      undef _mm_extract_epi32
#      define _mm_extract_epi32 _mm_extract_epi32_emu
__forceinline int _mm_extract_epi32_emu(__m128i input, const int index)
{
  switch (index) {
    case 0:
      return _mm_cvtsi128_si32(input);
    case 1:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(1, 1, 1, 1)));
    case 2:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(2, 2, 2, 2)));
    case 3:
      return _mm_cvtsi128_si32(_mm_shuffle_epi32(input, _MM_SHUFFLE(3, 3, 3, 3)));
    default:
      assert(false);
      return 0;
  }
}

#      undef _mm_insert_epi32
#      define _mm_insert_epi32 _mm_insert_epi32_emu
__forceinline __m128i _mm_insert_epi32_emu(__m128i value, int input, const int index)
{
  assert(index >= 0 && index < 4);
  ((int *)&value)[index] = input;
  return value;
}

#      undef _mm_insert_ps
#      define _mm_insert_ps _mm_insert_ps_emu
__forceinline __m128 _mm_insert_ps_emu(__m128 value, __m128 input, const int index)
{
  assert(index < 0x100);
  ((float *)&value)[(index >> 4) & 0x3] = ((float *)&input)[index >> 6];
  return _mm_andnot_ps(_mm_lookupmask_ps[index & 0xf], value);
}

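/* Note: the emulated _mm_round_ps below assumes the current MXCSR rounding
 * mode is round-to-nearest. The _MM_FROUND_TO_NEG_INF and _MM_FROUND_TO_POS_INF
 * cases bias the input by -0.5f or +0.5f before the nearest-integer conversion,
 * which approximates floor/ceil but can differ from the exact result for some
 * inputs (for example exact integers, due to round-to-even on the half). */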
#      undef _mm_round_ps
#      define _mm_round_ps _mm_round_ps_emu
__forceinline __m128 _mm_round_ps_emu(__m128 value, const int flags)
{
  switch (flags) {
    case _MM_FROUND_TO_NEAREST_INT:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(value));
    case _MM_FROUND_TO_NEG_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(-0.5f))));
    case _MM_FROUND_TO_POS_INF:
      return _mm_cvtepi32_ps(_mm_cvtps_epi32(_mm_add_ps(value, _mm_set1_ps(0.5f))));
    case _MM_FROUND_TO_ZERO:
      return _mm_cvtepi32_ps(_mm_cvttps_epi32(value));
  }
  return value;
}

#    endif /* !(defined(__KERNEL_SSE41__) || defined(__SSE4_1__) || defined(__SSE4_2__)) */

/* Older GCC versions do not have _mm256_cvtss_f32 yet, so define it ourselves.
 * _mm256_castps256_ps128 generates no instructions so this is just as efficient. */
#    if defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
#      undef _mm256_cvtss_f32
#      define _mm256_cvtss_f32(a) (_mm_cvtss_f32(_mm256_castps256_ps128(a)))
#    endif

#  else /* __KERNEL_SSE2__ */

/* This section is for utility functions that operate on non-register data and
 * might be used from non-vectorized code. */

ccl_device_inline int bitscan(int value)
{
  assert(value != 0);
  int bit = 0;
  while ((value & (1 << bit)) == 0) {
    ++bit;
  }
  return bit;
}

ccl_device_inline int __bsr(int value)
{
  assert(value != 0);
  int bit = 0;
  while (value >>= 1) {
    ++bit;
  }
  return bit;
}

#  endif /* __KERNEL_SSE2__ */

/* quiet unused define warnings */
#  if defined(__KERNEL_SSE2__) || defined(__KERNEL_SSE3__) || defined(__KERNEL_SSSE3__) || \
      defined(__KERNEL_SSE41__) || defined(__KERNEL_AVX__) || defined(__KERNEL_AVX2__)
/* do nothing */
#  endif

CCL_NAMESPACE_END

#endif /* __KERNEL_GPU__ */

#endif /* __UTIL_SIMD_TYPES_H__ */