/*
 * Copyright 2011-2013 Intel Corporation
 * Modifications Copyright 2014, Blender Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef __UTIL_SSEF_H__
#define __UTIL_SSEF_H__

#include "util_ssei.h"

CCL_NAMESPACE_BEGIN

#ifdef __KERNEL_SSE2__

struct sseb;
struct ssef;

/*! 4-wide SSE float type. */
struct ssef {
  typedef sseb Mask;   // mask type
  typedef ssei Int;    // int type
  typedef ssef Float;  // float type

  enum { size = 4 };  // number of SIMD elements
  union {
    __m128 m128;
    float f[4];
    int i[4];
  };  // data

  ////////////////////////////////////////////////////////////////////////////////
  /// Constructors, Assignment & Cast Operators
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline ssef()
  {
  }
  __forceinline ssef(const ssef &other)
  {
    m128 = other.m128;
  }
  __forceinline ssef &operator=(const ssef &other)
  {
    m128 = other.m128;
    return *this;
  }

  __forceinline ssef(const __m128 a) : m128(a)
  {
  }
  __forceinline operator const __m128 &() const
  {
    return m128;
  }
  __forceinline operator __m128 &()
  {
    return m128;
  }

  __forceinline ssef(float a) : m128(_mm_set1_ps(a))
  {
  }
  __forceinline ssef(float a, float b, float c, float d) : m128(_mm_setr_ps(a, b, c, d))
  {
  }

  __forceinline explicit ssef(const __m128i a) : m128(_mm_cvtepi32_ps(a))
  {
  }

  ////////////////////////////////////////////////////////////////////////////////
  /// Loads and Stores
  ////////////////////////////////////////////////////////////////////////////////

# if defined(__KERNEL_AVX__)
  static __forceinline ssef broadcast(const void *const a)
  {
    return _mm_broadcast_ss((float *)a);
  }
# else
  static __forceinline ssef broadcast(const void *const a)
  {
    return _mm_set1_ps(*(float *)a);
  }
# endif

  ////////////////////////////////////////////////////////////////////////////////
  /// Array Access
  ////////////////////////////////////////////////////////////////////////////////

  __forceinline const float &operator[](const size_t i) const
  {
    assert(i < 4);
    return f[i];
  }
  __forceinline float &operator[](const size_t i)
  {
    assert(i < 4);
    return f[i];
  }
};
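
/* A minimal usage sketch (values assumed for illustration, not part of this header):
 *
 *   ssef a(1.0f, 2.0f, 3.0f, 4.0f);       // four lanes set individually
 *   ssef b = ssef::broadcast(&a.f[2]);    // {3.0f, 3.0f, 3.0f, 3.0f}
 *   float x = a[0];                       // scalar lane access through the union
 */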

////////////////////////////////////////////////////////////////////////////////
/// Unary Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline const ssef cast(const __m128i &a)
{
  return _mm_castsi128_ps(a);
}
__forceinline const ssef operator+(const ssef &a)
{
  return a;
}
__forceinline const ssef operator-(const ssef &a)
{
  return _mm_xor_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
}
__forceinline const ssef abs(const ssef &a)
{
  return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
}
# if defined(__KERNEL_SSE41__)
__forceinline const ssef sign(const ssef &a)
{
  return _mm_blendv_ps(ssef(1.0f), -ssef(1.0f), _mm_cmplt_ps(a, ssef(0.0f)));
}
# endif
__forceinline const ssef signmsk(const ssef &a)
{
  return _mm_and_ps(a.m128, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
}

__forceinline const ssef rcp(const ssef &a)
{
  const ssef r = _mm_rcp_ps(a.m128);
  return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a));
}
__forceinline const ssef sqr(const ssef &a)
{
  return _mm_mul_ps(a, a);
}
__forceinline const ssef mm_sqrt(const ssef &a)
{
  return _mm_sqrt_ps(a.m128);
}
__forceinline const ssef rsqrt(const ssef &a)
{
  const ssef r = _mm_rsqrt_ps(a.m128);
  return _mm_add_ps(
      _mm_mul_ps(_mm_set_ps(1.5f, 1.5f, 1.5f, 1.5f), r),
      _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set_ps(-0.5f, -0.5f, -0.5f, -0.5f)), r),
                 _mm_mul_ps(r, r)));
}
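
/* Note: rcp() and rsqrt() above refine the approximate _mm_rcp_ps/_mm_rsqrt_ps
 * results with one Newton-Raphson step (r' = r * (2 - a * r) for the
 * reciprocal), improving the ~12-bit hardware approximation to roughly
 * single-precision accuracy. Illustrative sketch only, with assumed values:
 *
 *   ssef x(1.0f, 2.0f, 4.0f, 8.0f);
 *   ssef r = rcp(x);  // approximately {1.0f, 0.5f, 0.25f, 0.125f}
 */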

////////////////////////////////////////////////////////////////////////////////
/// Binary Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline const ssef operator+(const ssef &a, const ssef &b)
{
  return _mm_add_ps(a.m128, b.m128);
}
__forceinline const ssef operator+(const ssef &a, const float &b)
{
  return a + ssef(b);
}
__forceinline const ssef operator+(const float &a, const ssef &b)
{
  return ssef(a) + b;
}

__forceinline const ssef operator-(const ssef &a, const ssef &b)
{
  return _mm_sub_ps(a.m128, b.m128);
}
__forceinline const ssef operator-(const ssef &a, const float &b)
{
  return a - ssef(b);
}
__forceinline const ssef operator-(const float &a, const ssef &b)
{
  return ssef(a) - b;
}

__forceinline const ssef operator*(const ssef &a, const ssef &b)
{
  return _mm_mul_ps(a.m128, b.m128);
}
__forceinline const ssef operator*(const ssef &a, const float &b)
{
  return a * ssef(b);
}
__forceinline const ssef operator*(const float &a, const ssef &b)
{
  return ssef(a) * b;
}

__forceinline const ssef operator/(const ssef &a, const ssef &b)
{
  return _mm_div_ps(a.m128, b.m128);
}
__forceinline const ssef operator/(const ssef &a, const float &b)
{
  return a / ssef(b);
}
__forceinline const ssef operator/(const float &a, const ssef &b)
{
  return ssef(a) / b;
}

__forceinline const ssef operator^(const ssef &a, const ssef &b)
{
  return _mm_xor_ps(a.m128, b.m128);
}
__forceinline const ssef operator^(const ssef &a, const ssei &b)
{
  return _mm_xor_ps(a.m128, _mm_castsi128_ps(b.m128));
}

__forceinline const ssef operator&(const ssef &a, const ssef &b)
{
  return _mm_and_ps(a.m128, b.m128);
}
__forceinline const ssef operator&(const ssef &a, const ssei &b)
{
  return _mm_and_ps(a.m128, _mm_castsi128_ps(b.m128));
}

__forceinline const ssef operator|(const ssef &a, const ssef &b)
{
  return _mm_or_ps(a.m128, b.m128);
}
__forceinline const ssef operator|(const ssef &a, const ssei &b)
{
  return _mm_or_ps(a.m128, _mm_castsi128_ps(b.m128));
}

__forceinline const ssef andnot(const ssef &a, const ssef &b)
{
  return _mm_andnot_ps(a.m128, b.m128);
}

__forceinline const ssef min(const ssef &a, const ssef &b)
{
  return _mm_min_ps(a.m128, b.m128);
}
__forceinline const ssef min(const ssef &a, const float &b)
{
  return _mm_min_ps(a.m128, ssef(b));
}
__forceinline const ssef min(const float &a, const ssef &b)
{
  return _mm_min_ps(ssef(a), b.m128);
}

__forceinline const ssef max(const ssef &a, const ssef &b)
{
  return _mm_max_ps(a.m128, b.m128);
}
__forceinline const ssef max(const ssef &a, const float &b)
{
  return _mm_max_ps(a.m128, ssef(b));
}
__forceinline const ssef max(const float &a, const ssef &b)
{
  return _mm_max_ps(ssef(a), b.m128);
}

# if defined(__KERNEL_SSE41__)
__forceinline ssef mini(const ssef &a, const ssef &b)
{
  const ssei ai = _mm_castps_si128(a);
  const ssei bi = _mm_castps_si128(b);
  const ssei ci = _mm_min_epi32(ai, bi);
  return _mm_castsi128_ps(ci);
}
# endif

# if defined(__KERNEL_SSE41__)
__forceinline ssef maxi(const ssef &a, const ssef &b)
{
  const ssei ai = _mm_castps_si128(a);
  const ssei bi = _mm_castps_si128(b);
  const ssei ci = _mm_max_epi32(ai, bi);
  return _mm_castsi128_ps(ci);
}
# endif
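
/* Note on mini()/maxi() above: they compare float bit patterns with integer
 * min/max, which matches the float ordering only when both operands are
 * non-negative (for non-negative IEEE-754 floats, the bit patterns are
 * monotonically ordered). The non-negativity precondition appears to be the
 * caller's responsibility; min()/max() are the general-purpose versions. */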

////////////////////////////////////////////////////////////////////////////////
/// Ternary Operators
////////////////////////////////////////////////////////////////////////////////

# if defined(__KERNEL_AVX2__)
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
  return _mm_fmadd_ps(a, b, c);
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
  return _mm_fmsub_ps(a, b, c);
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
  return _mm_fnmadd_ps(a, b, c);
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
  return _mm_fnmsub_ps(a, b, c);
}
# else
__forceinline const ssef madd(const ssef &a, const ssef &b, const ssef &c)
{
  return a * b + c;
}
__forceinline const ssef msub(const ssef &a, const ssef &b, const ssef &c)
{
  return a * b - c;
}
__forceinline const ssef nmadd(const ssef &a, const ssef &b, const ssef &c)
{
  return c - a * b;
}
__forceinline const ssef nmsub(const ssef &a, const ssef &b, const ssef &c)
{
  return -a * b - c;
}
# endif
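
/* Sign conventions, for reference (both code paths compute the same result,
 * the FMA path just fuses the rounding):
 *   madd(a, b, c)  = a * b + c
 *   msub(a, b, c)  = a * b - c
 *   nmadd(a, b, c) = -(a * b) + c
 *   nmsub(a, b, c) = -(a * b) - c
 */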

////////////////////////////////////////////////////////////////////////////////
/// Assignment Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline ssef &operator+=(ssef &a, const ssef &b)
{
  return a = a + b;
}
__forceinline ssef &operator+=(ssef &a, const float &b)
{
  return a = a + b;
}

__forceinline ssef &operator-=(ssef &a, const ssef &b)
{
  return a = a - b;
}
__forceinline ssef &operator-=(ssef &a, const float &b)
{
  return a = a - b;
}

__forceinline ssef &operator*=(ssef &a, const ssef &b)
{
  return a = a * b;
}
__forceinline ssef &operator*=(ssef &a, const float &b)
{
  return a = a * b;
}

__forceinline ssef &operator/=(ssef &a, const ssef &b)
{
  return a = a / b;
}
__forceinline ssef &operator/=(ssef &a, const float &b)
{
  return a = a / b;
}

////////////////////////////////////////////////////////////////////////////////
/// Comparison Operators + Select
////////////////////////////////////////////////////////////////////////////////

__forceinline const sseb operator==(const ssef &a, const ssef &b)
{
  return _mm_cmpeq_ps(a.m128, b.m128);
}
__forceinline const sseb operator==(const ssef &a, const float &b)
{
  return a == ssef(b);
}
__forceinline const sseb operator==(const float &a, const ssef &b)
{
  return ssef(a) == b;
}

__forceinline const sseb operator!=(const ssef &a, const ssef &b)
{
  return _mm_cmpneq_ps(a.m128, b.m128);
}
__forceinline const sseb operator!=(const ssef &a, const float &b)
{
  return a != ssef(b);
}
__forceinline const sseb operator!=(const float &a, const ssef &b)
{
  return ssef(a) != b;
}

__forceinline const sseb operator<(const ssef &a, const ssef &b)
{
  return _mm_cmplt_ps(a.m128, b.m128);
}
__forceinline const sseb operator<(const ssef &a, const float &b)
{
  return a < ssef(b);
}
__forceinline const sseb operator<(const float &a, const ssef &b)
{
  return ssef(a) < b;
}

__forceinline const sseb operator>=(const ssef &a, const ssef &b)
{
  return _mm_cmpnlt_ps(a.m128, b.m128);
}
__forceinline const sseb operator>=(const ssef &a, const float &b)
{
  return a >= ssef(b);
}
__forceinline const sseb operator>=(const float &a, const ssef &b)
{
  return ssef(a) >= b;
}

__forceinline const sseb operator>(const ssef &a, const ssef &b)
{
  return _mm_cmpnle_ps(a.m128, b.m128);
}
__forceinline const sseb operator>(const ssef &a, const float &b)
{
  return a > ssef(b);
}
__forceinline const sseb operator>(const float &a, const ssef &b)
{
  return ssef(a) > b;
}

__forceinline const sseb operator<=(const ssef &a, const ssef &b)
{
  return _mm_cmple_ps(a.m128, b.m128);
}
__forceinline const sseb operator<=(const ssef &a, const float &b)
{
  return a <= ssef(b);
}
__forceinline const sseb operator<=(const float &a, const ssef &b)
{
  return ssef(a) <= b;
}

__forceinline const ssef select(const sseb &m, const ssef &t, const ssef &f)
{
# ifdef __KERNEL_SSE41__
  return _mm_blendv_ps(f, t, m);
# else
  return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
# endif
}

__forceinline const ssef select(const ssef &m, const ssef &t, const ssef &f)
{
# ifdef __KERNEL_SSE41__
  return _mm_blendv_ps(f, t, m);
# else
  return _mm_or_ps(_mm_and_ps(m, t), _mm_andnot_ps(m, f));
# endif
}

__forceinline const ssef select(const int mask, const ssef &t, const ssef &f)
{
# if defined(__KERNEL_SSE41__) && \
      ((!defined(__clang__) && !defined(_MSC_VER)) || defined(__INTEL_COMPILER))
  return _mm_blend_ps(f, t, mask);
# else
  return select(sseb(mask), t, f);
# endif
}
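
/* Illustrative sketch of per-lane selection (values assumed, not part of this
 * header): lanes where the mask is set come from t, the rest from f.
 *
 *   ssef t(1.0f, 2.0f, 3.0f, 4.0f);
 *   ssef f(5.0f, 6.0f, 7.0f, 8.0f);
 *   ssef r = select(t < 2.5f, t, f);  // {1.0f, 2.0f, 7.0f, 8.0f}
 */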

////////////////////////////////////////////////////////////////////////////////
/// Rounding Functions
////////////////////////////////////////////////////////////////////////////////

# if defined(__KERNEL_SSE41__)
__forceinline const ssef round_even(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT);
}
__forceinline const ssef round_down(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
}
__forceinline const ssef round_up(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
}
__forceinline const ssef round_zero(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_ZERO);
}
__forceinline const ssef floor(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF);
}
__forceinline const ssef ceil(const ssef &a)
{
  return _mm_round_ps(a, _MM_FROUND_TO_POS_INF);
}
# endif

__forceinline ssei truncatei(const ssef &a)
{
  return _mm_cvttps_epi32(a.m128);
}

/* This is about 25% faster than straightforward floor to integer conversion
 * due to better pipelining.
 *
 * In the lanes where (a < 0.0f), the comparison mask is 0xffffffff; adding it
 * with unsaturated (wrapping) integer arithmetic is the same as subtracting 1.
 */
__forceinline ssei floori(const ssef &a)
{
  return truncatei(a) + cast((a < 0.0f).m128);
}

__forceinline ssef floorfrac(const ssef &x, ssei *i)
{
  *i = floori(x);
  return x - ssef(*i);
}
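
/* Illustrative sketch (values assumed): floori() rounds toward negative
 * infinity, unlike truncatei(), and floorfrac() returns the fractional part
 * in [0, 1) together with the floor:
 *
 *   ssei i;
 *   ssef f = floorfrac(ssef(-1.25f, -0.5f, 0.5f, 1.75f), &i);
 *   // i = {-2, -1, 0, 1}, f = {0.75f, 0.5f, 0.5f, 0.75f}
 */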

////////////////////////////////////////////////////////////////////////////////
/// Common Functions
////////////////////////////////////////////////////////////////////////////////

__forceinline ssef mix(const ssef &a, const ssef &b, const ssef &t)
{
  return madd(t, b, (ssef(1.0f) - t) * a);
}

////////////////////////////////////////////////////////////////////////////////
/// Movement/Shifting/Shuffling Functions
////////////////////////////////////////////////////////////////////////////////

__forceinline ssef unpacklo(const ssef &a, const ssef &b)
{
  return _mm_unpacklo_ps(a.m128, b.m128);
}
__forceinline ssef unpackhi(const ssef &a, const ssef &b)
{
  return _mm_unpackhi_ps(a.m128, b.m128);
}

template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &b)
{
  return _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(b), _MM_SHUFFLE(i3, i2, i1, i0)));
}

template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a)
{
  return _mm_movelh_ps(a, a);
}

template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a)
{
  return _mm_movehl_ps(a, a);
}

template<size_t i0, size_t i1, size_t i2, size_t i3>
__forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
  return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i3, i2, i1, i0));
}

template<size_t i0> __forceinline const ssef shuffle(const ssef &a, const ssef &b)
{
  return _mm_shuffle_ps(a, b, _MM_SHUFFLE(i0, i0, i0, i0));
}

template<> __forceinline const ssef shuffle<0, 1, 0, 1>(const ssef &a, const ssef &b)
{
  return _mm_movelh_ps(a, b);
}

template<> __forceinline const ssef shuffle<2, 3, 2, 3>(const ssef &a, const ssef &b)
{
  return _mm_movehl_ps(b, a);
}

# if defined(__KERNEL_SSSE3__)
__forceinline const ssef shuffle8(const ssef &a, const ssei &shuf)
{
  return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a), shuf));
}
# endif

# if defined(__KERNEL_SSE3__)
template<> __forceinline const ssef shuffle<0, 0, 2, 2>(const ssef &b)
{
  return _mm_moveldup_ps(b);
}
template<> __forceinline const ssef shuffle<1, 1, 3, 3>(const ssef &b)
{
  return _mm_movehdup_ps(b);
}
# endif

template<size_t i0> __forceinline const ssef shuffle(const ssef &b)
{
  return shuffle<i0, i0, i0, i0>(b);
}

# if defined(__KERNEL_AVX__)
__forceinline const ssef shuffle(const ssef &a, const ssei &shuf)
{
  return _mm_permutevar_ps(a, shuf);
}
# endif
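
/* For reference: shuffle<i0, i1, i2, i3>(b) returns {b[i0], b[i1], b[i2], b[i3]}.
 * _MM_SHUFFLE takes its arguments highest lane first, hence the reversed
 * (i3, i2, i1, i0) order above. A sketch with assumed values:
 *
 *   ssef b(10.0f, 11.0f, 12.0f, 13.0f);
 *   ssef r = shuffle<3, 2, 1, 0>(b);  // {13.0f, 12.0f, 11.0f, 10.0f}
 */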

template<size_t i> __forceinline float extract(const ssef &a)
{
  return _mm_cvtss_f32(shuffle<i, i, i, i>(a));
}
template<> __forceinline float extract<0>(const ssef &a)
{
  return _mm_cvtss_f32(a);
}

# if defined(__KERNEL_SSE41__)
template<size_t dst, size_t src, size_t clr>
__forceinline const ssef insert(const ssef &a, const ssef &b)
{
  return _mm_insert_ps(a, b, (dst << 4) | (src << 6) | clr);
}
template<size_t dst, size_t src> __forceinline const ssef insert(const ssef &a, const ssef &b)
{
  return insert<dst, src, 0>(a, b);
}
template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
{
  return insert<dst, 0>(a, _mm_set_ss(b));
}
# else
template<size_t dst> __forceinline const ssef insert(const ssef &a, const float b)
{
  ssef c = a;
  c[dst] = b;
  return c;
}
# endif

////////////////////////////////////////////////////////////////////////////////
/// Transpose
////////////////////////////////////////////////////////////////////////////////

__forceinline void transpose(const ssef &r0,
                             const ssef &r1,
                             const ssef &r2,
                             const ssef &r3,
                             ssef &c0,
                             ssef &c1,
                             ssef &c2,
                             ssef &c3)
{
  ssef l02 = unpacklo(r0, r2);
  ssef h02 = unpackhi(r0, r2);
  ssef l13 = unpacklo(r1, r3);
  ssef h13 = unpackhi(r1, r3);
  c0 = unpacklo(l02, l13);
  c1 = unpackhi(l02, l13);
  c2 = unpacklo(h02, h13);
  c3 = unpackhi(h02, h13);
}

__forceinline void transpose(
    const ssef &r0, const ssef &r1, const ssef &r2, const ssef &r3, ssef &c0, ssef &c1, ssef &c2)
{
  ssef l02 = unpacklo(r0, r2);
  ssef h02 = unpackhi(r0, r2);
  ssef l13 = unpacklo(r1, r3);
  ssef h13 = unpackhi(r1, r3);
  c0 = unpacklo(l02, l13);
  c1 = unpackhi(l02, l13);
  c2 = unpacklo(h02, h13);
}
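
/* Illustrative sketch (values assumed): treating r0..r3 as the rows of a 4x4
 * matrix, transpose() writes its columns to c0..c3 (the second overload skips
 * the last column):
 *
 *   ssef r0(1.0f, 2.0f, 3.0f, 4.0f), r1(5.0f, 6.0f, 7.0f, 8.0f);
 *   ssef r2(9.0f, 10.0f, 11.0f, 12.0f), r3(13.0f, 14.0f, 15.0f, 16.0f);
 *   ssef c0, c1, c2, c3;
 *   transpose(r0, r1, r2, r3, c0, c1, c2, c3);  // c0 = {1, 5, 9, 13}, etc.
 */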

////////////////////////////////////////////////////////////////////////////////
/// Reductions
////////////////////////////////////////////////////////////////////////////////

__forceinline const ssef vreduce_min(const ssef &v)
{
  ssef h = min(shuffle<1, 0, 3, 2>(v), v);
  return min(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssef vreduce_max(const ssef &v)
{
  ssef h = max(shuffle<1, 0, 3, 2>(v), v);
  return max(shuffle<2, 3, 0, 1>(h), h);
}
__forceinline const ssef vreduce_add(const ssef &v)
{
  ssef h = shuffle<1, 0, 3, 2>(v) + v;
  return shuffle<2, 3, 0, 1>(h) + h;
}

__forceinline float reduce_min(const ssef &v)
{
  return _mm_cvtss_f32(vreduce_min(v));
}
__forceinline float reduce_max(const ssef &v)
{
  return _mm_cvtss_f32(vreduce_max(v));
}
__forceinline float reduce_add(const ssef &v)
{
  return _mm_cvtss_f32(vreduce_add(v));
}
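
/* A sketch with assumed values: the vreduce_* variants broadcast the reduction
 * to all four lanes (two shuffle/op pairs), while reduce_* extract it as a
 * scalar from lane 0:
 *
 *   ssef v(3.0f, 1.0f, 4.0f, 2.0f);
 *   float s = reduce_add(v);  // 10.0f
 *   float m = reduce_min(v);  // 1.0f
 */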

__forceinline size_t select_min(const ssef &v)
{
  return __bsf(movemask(v == vreduce_min(v)));
}
__forceinline size_t select_max(const ssef &v)
{
  return __bsf(movemask(v == vreduce_max(v)));
}

__forceinline size_t select_min(const sseb &valid, const ssef &v)
{
  const ssef a = select(valid, v, ssef(pos_inf));
  return __bsf(movemask(valid & (a == vreduce_min(a))));
}
__forceinline size_t select_max(const sseb &valid, const ssef &v)
{
  const ssef a = select(valid, v, ssef(neg_inf));
  return __bsf(movemask(valid & (a == vreduce_max(a))));
}

__forceinline size_t movemask(const ssef &a)
{
  return _mm_movemask_ps(a);
}
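
/* For reference: movemask() packs the four lane sign bits into the low bits of
 * the result, lane 0 in bit 0. A sketch with assumed values:
 *
 *   size_t m = movemask(ssef(-1.0f, 1.0f, -1.0f, 1.0f));  // 0b0101 = 5
 */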

////////////////////////////////////////////////////////////////////////////////
/// Memory load and store operations
////////////////////////////////////////////////////////////////////////////////

__forceinline ssef load4f(const float4 &a)
{
# ifdef __KERNEL_WITH_SSE_ALIGN__
  return _mm_load_ps(&a.x);
# else
  return _mm_loadu_ps(&a.x);
# endif
}

__forceinline ssef load4f(const float3 &a)
{
# ifdef __KERNEL_WITH_SSE_ALIGN__
  return _mm_load_ps(&a.x);
# else
  return _mm_loadu_ps(&a.x);
# endif
}

__forceinline ssef load4f(const void *const a)
{
  return _mm_load_ps((float *)a);
}

__forceinline ssef load1f_first(const float a)
{
  return _mm_set_ss(a);
}

__forceinline void store4f(void *ptr, const ssef &v)
{
  _mm_store_ps((float *)ptr, v);
}

__forceinline ssef loadu4f(const void *const a)
{
  return _mm_loadu_ps((float *)a);
}

__forceinline void storeu4f(void *ptr, const ssef &v)
{
  _mm_storeu_ps((float *)ptr, v);
}

__forceinline void store4f(const sseb &mask, void *ptr, const ssef &f)
{
# if defined(__KERNEL_AVX__)
  _mm_maskstore_ps((float *)ptr, (__m128i)mask, f);
# else
  *(ssef *)ptr = select(mask, f, *(ssef *)ptr);
# endif
}

__forceinline ssef load4f_nt(void *ptr)
{
# if defined(__KERNEL_SSE41__)
  return _mm_castsi128_ps(_mm_stream_load_si128((__m128i *)ptr));
# else
  return _mm_load_ps((float *)ptr);
# endif
}

__forceinline void store4f_nt(void *ptr, const ssef &v)
{
# if defined(__KERNEL_SSE41__)
  _mm_stream_ps((float *)ptr, v);
# else
  _mm_store_ps((float *)ptr, v);
# endif
}

////////////////////////////////////////////////////////////////////////////////
/// Euclidean Space Operators
////////////////////////////////////////////////////////////////////////////////

__forceinline float dot(const ssef &a, const ssef &b)
{
  return reduce_add(a * b);
}

/* Calculate the shuffled cross product, useful when the order of components
 * does not matter. */
__forceinline ssef cross_zxy(const ssef &a, const ssef &b)
{
  const ssef a0 = a;
  const ssef b0 = shuffle<1, 2, 0, 3>(b);
  const ssef a1 = shuffle<1, 2, 0, 3>(a);
  const ssef b1 = b;
  return msub(a0, b0, a1 * b1);
}

__forceinline ssef cross(const ssef &a, const ssef &b)
{
  return shuffle<1, 2, 0, 3>(cross_zxy(a, b));
}
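
/* For reference: cross_zxy() computes a * shuffle<1, 2, 0, 3>(b) -
 * shuffle<1, 2, 0, 3>(a) * b, whose lanes are ((a x b).z, (a x b).x,
 * (a x b).y, ...), i.e. the cross product with its components rotated;
 * cross() applies one more rotation to restore (x, y, z) order. */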

ccl_device_inline const ssef dot3_splat(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_SSE41__
  return _mm_dp_ps(a.m128, b.m128, 0x7f);
# else
  ssef t = a * b;
  return ssef(((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2]);
# endif
}

/* Squared length taking only the specified axes into account. */
template<size_t X, size_t Y, size_t Z, size_t W> ccl_device_inline float len_squared(const ssef &a)
{
# ifndef __KERNEL_SSE41__
  float4 &t = (float4 &)a;
  return (X ? t.x * t.x : 0.0f) + (Y ? t.y * t.y : 0.0f) + (Z ? t.z * t.z : 0.0f) +
         (W ? t.w * t.w : 0.0f);
# else
  return extract<0>(
      ssef(_mm_dp_ps(a.m128, a.m128, (X << 4) | (Y << 5) | (Z << 6) | (W << 7) | 0xf)));
# endif
}

ccl_device_inline float dot3(const ssef &a, const ssef &b)
{
# ifdef __KERNEL_SSE41__
  return extract<0>(ssef(_mm_dp_ps(a.m128, b.m128, 0x7f)));
# else
  ssef t = a * b;
  return ((float *)&t)[0] + ((float *)&t)[1] + ((float *)&t)[2];
# endif
}

ccl_device_inline const ssef len3_squared_splat(const ssef &a)
{
  return dot3_splat(a, a);
}

ccl_device_inline float len3_squared(const ssef &a)
{
  return dot3(a, a);
}

ccl_device_inline float len3(const ssef &a)
{
  return extract<0>(mm_sqrt(dot3_splat(a, a)));
}

/* SSE shuffle utility functions. */

# ifdef __KERNEL_SSSE3__

/* Faster version for SSSE3. */
typedef ssei shuffle_swap_t;

ccl_device_inline shuffle_swap_t shuffle_swap_identity()
{
  return _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0);
}

ccl_device_inline shuffle_swap_t shuffle_swap_swap()
{
  return _mm_set_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
}

ccl_device_inline const ssef shuffle_swap(const ssef &a, const shuffle_swap_t &shuf)
{
  return cast(_mm_shuffle_epi8(cast(a), shuf));
}

# else

/* Somewhat slower version for SSE2. */
typedef int shuffle_swap_t;

ccl_device_inline shuffle_swap_t shuffle_swap_identity()
{
  return 0;
}

ccl_device_inline shuffle_swap_t shuffle_swap_swap()
{
  return 1;
}

ccl_device_inline const ssef shuffle_swap(const ssef &a, shuffle_swap_t shuf)
{
  /* The shuffle immediate must be a compile-time constant, so we need to branch. */
  if (shuf)
    return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(1, 0, 3, 2)));
  else
    return ssef(_mm_shuffle_ps(a.m128, a.m128, _MM_SHUFFLE(3, 2, 1, 0)));
}

# endif
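
/* A sketch with assumed values: on both code paths, shuffle_swap() with
 * shuffle_swap_swap() exchanges the two 64-bit halves of the vector, and with
 * shuffle_swap_identity() returns it unchanged:
 *
 *   ssef a(1.0f, 2.0f, 3.0f, 4.0f);
 *   ssef r = shuffle_swap(a, shuffle_swap_swap());  // {3.0f, 4.0f, 1.0f, 2.0f}
 */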

# ifdef __KERNEL_SSE41__

ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
                                          const shuffle_swap_t &shuf_identity,
                                          const shuffle_swap_t &shuf_swap,
                                          const float3 &idir,
                                          ssef idirsplat[3],
                                          shuffle_swap_t shufflexyz[3])
{
  const __m128 idirsplat_raw[] = {_mm_set_ps1(idir.x), _mm_set_ps1(idir.y), _mm_set_ps1(idir.z)};
  idirsplat[0] = _mm_xor_ps(idirsplat_raw[0], pn);
  idirsplat[1] = _mm_xor_ps(idirsplat_raw[1], pn);
  idirsplat[2] = _mm_xor_ps(idirsplat_raw[2], pn);

  const ssef signmask = cast(ssei(0x80000000));
  const ssef shuf_identity_f = cast(shuf_identity);
  const ssef shuf_swap_f = cast(shuf_swap);

  shufflexyz[0] = _mm_castps_si128(
      _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[0], signmask)));
  shufflexyz[1] = _mm_castps_si128(
      _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[1], signmask)));
  shufflexyz[2] = _mm_castps_si128(
      _mm_blendv_ps(shuf_identity_f, shuf_swap_f, _mm_and_ps(idirsplat_raw[2], signmask)));
}

# else

ccl_device_inline void gen_idirsplat_swap(const ssef &pn,
                                          const shuffle_swap_t &shuf_identity,
                                          const shuffle_swap_t &shuf_swap,
                                          const float3 &idir,
                                          ssef idirsplat[3],
                                          shuffle_swap_t shufflexyz[3])
{
  idirsplat[0] = ssef(idir.x) ^ pn;
  idirsplat[1] = ssef(idir.y) ^ pn;
  idirsplat[2] = ssef(idir.z) ^ pn;

  shufflexyz[0] = (idir.x >= 0) ? shuf_identity : shuf_swap;
  shufflexyz[1] = (idir.y >= 0) ? shuf_identity : shuf_swap;
  shufflexyz[2] = (idir.z >= 0) ? shuf_identity : shuf_swap;
}

# endif

ccl_device_inline const ssef uint32_to_float(const ssei &in)
{
  ssei a = _mm_srli_epi32(in, 16);
  ssei b = _mm_and_si128(in, _mm_set1_epi32(0x0000ffff));
  ssei c = _mm_or_si128(a, _mm_set1_epi32(0x53000000));
  ssef d = _mm_cvtepi32_ps(b);
  ssef e = _mm_sub_ps(_mm_castsi128_ps(c), _mm_castsi128_ps(_mm_set1_epi32(0x53000000)));
  return _mm_add_ps(e, d);
}
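
/* For reference: uint32_to_float() handles unsigned 32-bit values that
 * _mm_cvtepi32_ps alone would misread as negative (those >= 2^31). The low
 * 16 bits are converted directly; the high 16 bits are placed in the mantissa
 * of 2^39 (bit pattern 0x53000000), so subtracting 2^39 leaves exactly
 * (in >> 16) * 2^16, and the two halves are summed. */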

template<size_t S1, size_t S2, size_t S3, size_t S4>
ccl_device_inline const ssef set_sign_bit(const ssef &a)
{
  return cast(cast(a) ^ ssei(S1 << 31, S2 << 31, S3 << 31, S4 << 31));
}

////////////////////////////////////////////////////////////////////////////////
/// Debug Functions
////////////////////////////////////////////////////////////////////////////////

ccl_device_inline void print_ssef(const char *label, const ssef &a)
{
  printf(
      "%s: %.8f %.8f %.8f %.8f\n", label, (double)a[0], (double)a[1], (double)a[2], (double)a[3]);
}

#endif

CCL_NAMESPACE_END

#endif