1 /**************************** vectorf256e.h *******************************
2 * Author: Agner Fog
3 * Date created: 2012-05-30
4 * Last modified: 2017-02-19
5 * Version: 1.27
6 * Project: vector classes
7 * Description:
8 * Header file defining 256-bit floating point vector classes as interface
9 * to intrinsic functions. Emulated for processors without AVX instruction set.
10 *
11 * The following vector classes are defined here:
12 * Vec8f Vector of 8 single precision floating point numbers
13 * Vec8fb Vector of 8 Booleans for use with Vec8f
14 * Vec4d Vector of 4 double precision floating point numbers
15 * Vec4db Vector of 4 Booleans for use with Vec4d
16 *
17 * For detailed instructions, see VectorClass.pdf
18 *
19 * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
20 *****************************************************************************/
21
22 // check combination of header files
23 #ifdef VECTORF256_H
24 #if VECTORF256_H != 1
25 #error Two different versions of vectorf256.h included
26 #endif
27 #else
28 #define VECTORF256_H 1
29
30 #if defined (VECTORI256_H) && VECTORI256_H >= 2
31 #error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2
32 #endif
33
34
35 #include "vectorf128.h" // Define 128-bit vectors
36
37 #ifdef VCL_NAMESPACE
38 namespace VCL_NAMESPACE {
39 #endif
40
41 /*****************************************************************************
42 *
43 * base class Vec256fe and Vec256de
44 *
45 *****************************************************************************/
46 // base class to replace __m256 when AVX is not supported
47 class Vec256fe {
48 protected:
49 __m128 y0; // low half
50 __m128 y1; // high half
51 public:
Vec256fe(void)52 Vec256fe(void) {}; // default constructor
Vec256fe(__m128 x0,__m128 x1)53 Vec256fe(__m128 x0, __m128 x1) { // constructor to build from two __m128
54 y0 = x0; y1 = x1;
55 }
get_low()56 __m128 get_low() const { // get low half
57 return y0;
58 }
get_high()59 __m128 get_high() const { // get high half
60 return y1;
61 }
62 };
63
64 // base class to replace __m256d when AVX is not supported
65 class Vec256de {
66 public:
Vec256de()67 Vec256de() {}; // default constructor
Vec256de(__m128d x0,__m128d x1)68 Vec256de(__m128d x0, __m128d x1) { // constructor to build from two __m128d
69 y0 = x0; y1 = x1;
70 }
get_low()71 __m128d get_low() const { // get low half
72 return y0;
73 }
get_high()74 __m128d get_high() const { // get high half
75 return y1;
76 }
77 protected:
78 __m128d y0; // low half
79 __m128d y1; // high half
80 };
81
82
83 /*****************************************************************************
84 *
85 * select functions
86 *
87 *****************************************************************************/
88 // Select between two Vec256fe sources, element by element. Used in various functions
89 // and operators. Corresponds to this pseudocode:
90 // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
91 // Each element in s must be either 0 (false) or 0xFFFFFFFF (true).
selectf(Vec256fe const & s,Vec256fe const & a,Vec256fe const & b)92 static inline Vec256fe selectf (Vec256fe const & s, Vec256fe const & a, Vec256fe const & b) {
93 return Vec256fe(selectf(b.get_low(), a.get_low(), s.get_low()), selectf(b.get_high(), a.get_high(), s.get_high()));
94 }
95
96 // Same, with two Vec256de sources.
97 // and operators. Corresponds to this pseudocode:
98 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
99 // Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
100 // values are allowed.
selectd(Vec256de const & s,Vec256de const & a,Vec256de const & b)101 static inline Vec256de selectd (Vec256de const & s, Vec256de const & a, Vec256de const & b) {
102 return Vec256de(selectd(b.get_low(), a.get_low(), s.get_low()), selectd(b.get_high(), a.get_high(), s.get_high()));
103 }
104
105
106
107 /*****************************************************************************
108 *
109 * Generate compile-time constant vector
110 *
111 *****************************************************************************/
112 // Generate a constant vector of 8 integers stored in memory,
113 // load as __m256
114 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
constant8f()115 static inline Vec256fe constant8f() {
116 static const union {
117 int i[8];
118 __m128 y[2];
119 } u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
120 return Vec256fe(u.y[0], u.y[1]);
121 }
122
123
124 /*****************************************************************************
125 *
126 * Vec8fb: Vector of 8 Booleans for use with Vec8f
127 *
128 *****************************************************************************/
129
130 class Vec8fb : public Vec256fe {
131 public:
132 // Default constructor:
Vec8fb()133 Vec8fb() {
134 }
135 // Constructor to build from all elements:
Vec8fb(bool b0,bool b1,bool b2,bool b3,bool b4,bool b5,bool b6,bool b7)136 Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
137 y0 = Vec4fb(b0, b1, b2, b3);
138 y1 = Vec4fb(b4, b5, b6, b7);
139 }
140 // Constructor to build from two Vec4fb:
Vec8fb(Vec4fb const & a0,Vec4fb const & a1)141 Vec8fb(Vec4fb const & a0, Vec4fb const & a1) {
142 y0 = a0; y1 = a1;
143 }
144 // Constructor to convert from type Vec256fe
Vec8fb(Vec256fe const & x)145 Vec8fb(Vec256fe const & x) {
146 y0 = x.get_low(); y1 = x.get_high();
147 }
148 // Assignment operator to convert from type Vec256fe
149 Vec8fb & operator = (Vec256fe const & x) {
150 y0 = x.get_low(); y1 = x.get_high();
151 return *this;
152 }
153 #ifdef VECTORI256_H // 256 bit integer vectors are available
154 // Constructor to convert from type Vec8ib used as Boolean for integer vectors
Vec8fb(Vec8ib const & x)155 Vec8fb(Vec8ib const & x) {
156 y0 = _mm_castsi128_ps(Vec8i(x).get_low());
157 y1 = _mm_castsi128_ps(Vec8i(x).get_high());
158 }
159 // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
160 Vec8fb & operator = (Vec8ib const & x) {
161 y0 = _mm_castsi128_ps(Vec8i(x).get_low());
162 y1 = _mm_castsi128_ps(Vec8i(x).get_high());
163 return *this;
164 }
165 // Constructor to broadcast the same value into all elements:
Vec8fb(bool b)166 Vec8fb(bool b) {
167 y1 = y0 = Vec4fb(b);
168 }
169 // Assignment operator to broadcast scalar value:
170 Vec8fb & operator = (bool b) {
171 y0 = y1 = Vec4fb(b);
172 return *this;
173 }
174 private: // Prevent constructing from int, etc.
175 Vec8fb(int b);
176 Vec8fb & operator = (int x);
177 public:
178 // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
Vec8ib()179 operator Vec8ib() const {
180 return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1));
181 }
182 #endif // VECTORI256_H
183 // Member function to change a single element in vector
184 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)185 Vec8fb const & insert(uint32_t index, bool value) {
186 if (index < 4) {
187 y0 = Vec4fb(y0).insert(index, value);
188 }
189 else {
190 y1 = Vec4fb(y1).insert(index-4, value);
191 }
192 return *this;
193 }
194 // Member function extract a single element from vector
195 // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)196 bool extract(uint32_t index) const {
197 if (index < 4) {
198 return Vec4fb(y0).extract(index);
199 }
200 else {
201 return Vec4fb(y1).extract(index-4);
202 }
203 }
204 // Extract a single element. Operator [] can only read an element, not write.
205 bool operator [] (uint32_t index) const {
206 return extract(index);
207 }
208 // Member functions to split into two Vec4fb:
get_low()209 Vec4fb get_low() const {
210 return y0;
211 }
get_high()212 Vec4fb get_high() const {
213 return y1;
214 }
size()215 static int size () {
216 return 8;
217 }
218 };
219
220
221 /*****************************************************************************
222 *
223 * Operators for Vec8fb
224 *
225 *****************************************************************************/
226
227 // vector operator & : bitwise and
228 static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) {
229 return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
230 }
231
232 static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) {
233 return a & b;
234 }
235
236 // vector operator &= : bitwise and
237 static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) {
238 a = a & b;
239 return a;
240 }
241
242 // vector operator | : bitwise or
243 static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) {
244 return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
245 }
246 static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) {
247 return a | b;
248 }
249
250 // vector operator |= : bitwise or
251 static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) {
252 a = a | b;
253 return a;
254 }
255
256 // vector operator ^ : bitwise xor
257 static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) {
258 return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
259 }
260
261 // vector operator ^= : bitwise xor
262 static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) {
263 a = a ^ b;
264 return a;
265 }
266
267 // vector operator ~ : bitwise not
268 static inline Vec8fb operator ~ (Vec8fb const & a) {
269 return Vec8fb(~a.get_low(), ~a.get_high());
270 }
271
272 // vector operator ! : logical not
273 // (operator ! is less efficient than operator ~. Use only where not
274 // all bits in an element are the same)
275 static inline Vec8fb operator ! (Vec8fb const & a) {
276 return Vec8fb(!a.get_low(), !a.get_high());
277 }
278
279 // Functions for Vec8fb
280
281 // andnot: a & ~ b
andnot(Vec8fb const & a,Vec8fb const & b)282 static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) {
283 return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
284 }
285
286
287
288 /*****************************************************************************
289 *
290 * Horizontal Boolean functions
291 *
292 *****************************************************************************/
293
294 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec8fb const & a)295 static inline bool horizontal_and (Vec8fb const & a) {
296 return horizontal_and(a.get_low() & a.get_high());
297 }
298
299 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec8fb const & a)300 static inline bool horizontal_or (Vec8fb const & a) {
301 return horizontal_or(a.get_low() | a.get_high());
302 }
303
304
305
306 /*****************************************************************************
307 *
308 * Vec4db: Vector of 4 Booleans for use with Vec4d
309 *
310 *****************************************************************************/
311
312 class Vec4db : public Vec256de {
313 public:
314 // Default constructor:
Vec4db()315 Vec4db() {
316 }
317 // Constructor to build from all elements:
Vec4db(bool b0,bool b1,bool b2,bool b3)318 Vec4db(bool b0, bool b1, bool b2, bool b3) {
319 y0 = Vec2db(b0, b1);
320 y1 = Vec2db(b2, b3);
321 }
322 // Constructor to build from two Vec2db:
Vec4db(Vec2db const & a0,Vec2db const & a1)323 Vec4db(Vec2db const & a0, Vec2db const & a1) {
324 y0 = a0; y1 = a1;
325 }
326 // Constructor to convert from type Vec256de
Vec4db(Vec256de const & x)327 Vec4db(Vec256de const & x) {
328 y0 = x.get_low(); y1 = x.get_high();
329 }
330 // Assignment operator to convert from type Vec256de
331 Vec4db & operator = (Vec256de const & x) {
332 y0 = x.get_low(); y1 = x.get_high();
333 return *this;
334 }
335 #ifdef VECTORI256_H // 256 bit integer vectors are available
336 // Constructor to convert from type Vec4qb used as Boolean for integer vectors
Vec4db(Vec4qb const & x)337 Vec4db(Vec4qb const & x) {
338 y0 = _mm_castsi128_pd(Vec4q(x).get_low());
339 y1 = _mm_castsi128_pd(Vec4q(x).get_high());
340 }
341 // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
342 Vec4db & operator = (Vec4qb const & x) {
343 y0 = _mm_castsi128_pd(Vec4q(x).get_low());
344 y1 = _mm_castsi128_pd(Vec4q(x).get_high());
345 return *this;
346 }
347 // Constructor to broadcast the same value into all elements:
Vec4db(bool b)348 Vec4db(bool b) {
349 y1 = y0 = Vec2db(b);
350 }
351 // Assignment operator to broadcast scalar value:
352 Vec4db & operator = (bool b) {
353 y0 = y1 = Vec2db(b);
354 return *this;
355 }
356 private: // Prevent constructing from int, etc.
357 Vec4db(int b);
358 Vec4db & operator = (int x);
359 public:
360 // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
Vec4qb()361 operator Vec4qb() const {
362 return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1));
363 }
364 #endif // VECTORI256_H
365 // Member function to change a single element in vector
366 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)367 Vec4db const & insert(uint32_t index, bool value) {
368 if (index < 2) {
369 y0 = Vec2db(y0).insert(index, value);
370 }
371 else {
372 y1 = Vec2db(y1).insert(index - 2, value);
373 }
374 return *this;
375 }
376 // Member function extract a single element from vector
377 // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)378 bool extract(uint32_t index) const {
379 if (index < 2) {
380 return Vec2db(y0).extract(index);
381 }
382 else {
383 return Vec2db(y1).extract(index - 2);
384 }
385 }
386 // Extract a single element. Operator [] can only read an element, not write.
387 bool operator [] (uint32_t index) const {
388 return extract(index);
389 }
390 // Member functions to split into two Vec4fb:
get_low()391 Vec2db get_low() const {
392 return y0;
393 }
get_high()394 Vec2db get_high() const {
395 return y1;
396 }
size()397 static int size () {
398 return 4;
399 }
400 };
401
402
403 /*****************************************************************************
404 *
405 * Operators for Vec4db
406 *
407 *****************************************************************************/
408
409 // vector operator & : bitwise and
410 static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) {
411 return Vec4db(a.get_low() & b.get_low(), a.get_high() & b.get_high());
412 }
413 static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) {
414 return a & b;
415 }
416
417 // vector operator &= : bitwise and
418 static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) {
419 a = a & b;
420 return a;
421 }
422
423 // vector operator | : bitwise or
424 static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) {
425 return Vec4db(a.get_low() | b.get_low(), a.get_high() | b.get_high());
426 }
427 static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) {
428 return a | b;
429 }
430
431 // vector operator |= : bitwise or
432 static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) {
433 a = a | b;
434 return a;
435 }
436
437 // vector operator ^ : bitwise xor
438 static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) {
439 return Vec4db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
440
441 }
442
443 // vector operator ^= : bitwise xor
444 static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) {
445 a = a ^ b;
446 return a;
447 }
448
449 // vector operator ~ : bitwise not
450 static inline Vec4db operator ~ (Vec4db const & a) {
451 return Vec4db(~a.get_low(), ~a.get_high());
452 }
453
454 // vector operator ! : logical not
455 // (operator ! is less efficient than operator ~. Use only where not
456 // all bits in an element are the same)
457 static inline Vec4db operator ! (Vec4db const & a) {
458 return Vec4db(!a.get_low(), !a.get_high());
459 }
460
461 // Functions for Vec4db
462
463 // andnot: a & ~ b
andnot(Vec4db const & a,Vec4db const & b)464 static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) {
465 return Vec4db(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
466 }
467
468
469 /*****************************************************************************
470 *
471 * Horizontal Boolean functions
472 *
473 *****************************************************************************/
474
475 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec4db const & a)476 static inline bool horizontal_and (Vec4db const & a) {
477 return horizontal_and(a.get_low() & a.get_high());
478 }
479
480 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec4db const & a)481 static inline bool horizontal_or (Vec4db const & a) {
482 return horizontal_or(a.get_low() | a.get_high());
483 }
484
485
486
487 /*****************************************************************************
488 *
489 * Vec8f: Vector of 8 single precision floating point values
490 *
491 *****************************************************************************/
492
493 class Vec8f : public Vec256fe {
494 public:
495 // Default constructor:
Vec8f()496 Vec8f() {
497 }
498 // Constructor to broadcast the same value into all elements:
Vec8f(float f)499 Vec8f(float f) {
500 y1 = y0 = _mm_set1_ps(f);
501 }
502 // Constructor to build from all elements:
Vec8f(float f0,float f1,float f2,float f3,float f4,float f5,float f6,float f7)503 Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
504 y0 = _mm_setr_ps(f0, f1, f2, f3);
505 y1 = _mm_setr_ps(f4, f5, f6, f7);
506 }
507 // Constructor to build from two Vec4f:
Vec8f(Vec4f const & a0,Vec4f const & a1)508 Vec8f(Vec4f const & a0, Vec4f const & a1) {
509 y0 = a0; y1 = a1;
510 }
511 // Constructor to convert from type Vec256fe
Vec8f(Vec256fe const & x)512 Vec8f(Vec256fe const & x) {
513 y0 = x.get_low(); y1 = x.get_high();
514 }
515 // Assignment operator to convert from type Vec256fe
516 Vec8f & operator = (Vec256fe const & x) {
517 y0 = x.get_low(); y1 = x.get_high();
518 return *this;
519 }
520 // Member function to load from array (unaligned)
load(float const * p)521 Vec8f & load(float const * p) {
522 y0 = _mm_loadu_ps(p);
523 y1 = _mm_loadu_ps(p+4);
524 return *this;
525 }
526 // Member function to load from array, aligned by 32
527 // You may use load_a instead of load if you are certain that p points to an address
528 // divisible by 32.
load_a(float const * p)529 Vec8f & load_a(float const * p) {
530 y0 = _mm_load_ps(p);
531 y1 = _mm_load_ps(p+4);
532 return *this;
533 }
534 // Member function to store into array (unaligned)
store(float * p)535 void store(float * p) const {
536 _mm_storeu_ps(p, y0);
537 _mm_storeu_ps(p+4, y1);
538 }
539 // Member function to store into array, aligned by 32
540 // You may use store_a instead of store if you are certain that p points to an address
541 // divisible by 32.
store_a(float * p)542 void store_a(float * p) const {
543 _mm_store_ps(p, y0);
544 _mm_store_ps(p+4, y1);
545 }
546 // Partial load. Load n elements and set the rest to 0
load_partial(int n,float const * p)547 Vec8f & load_partial(int n, float const * p) {
548 if (n > 0 && n <= 4) {
549 *this = Vec8f(Vec4f().load_partial(n, p),_mm_setzero_ps());
550 }
551 else if (n > 4 && n <= 8) {
552 *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
553 }
554 else {
555 y1 = y0 = _mm_setzero_ps();
556 }
557 return *this;
558 }
559 // Partial store. Store n elements
store_partial(int n,float * p)560 void store_partial(int n, float * p) const {
561 if (n <= 4) {
562 get_low().store_partial(n, p);
563 }
564 else if (n <= 8) {
565 get_low().store(p);
566 get_high().store_partial(n - 4, p + 4);
567 }
568 }
569 // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)570 Vec8f & cutoff(int n) {
571 if (uint32_t(n) >= 8) return *this;
572 else if (n >= 4) {
573 y1 = Vec4f(y1).cutoff(n - 4);
574 }
575 else {
576 y0 = Vec4f(y0).cutoff(n);
577 y1 = Vec4f(0.0f);
578 }
579 return *this;
580 }
581 // Member function to change a single element in vector
582 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,float value)583 Vec8f const & insert(uint32_t index, float value) {
584 if (index < 4) {
585 y0 = Vec4f(y0).insert(index, value);
586 }
587 else {
588 y1 = Vec4f(y1).insert(index - 4, value);
589 }
590 return *this;
591 }
592 // Member function extract a single element from vector
593 // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)594 float extract(uint32_t index) const {
595 if (index < 4) {
596 return Vec4f(y0).extract(index);
597 }
598 else {
599 return Vec4f(y1).extract(index - 4);
600 }
601 }
602 // Extract a single element. Use store function if extracting more than one element.
603 // Operator [] can only read an element, not write.
604 float operator [] (uint32_t index) const {
605 return extract(index);
606 }
607 // Member functions to split into two Vec4f:
get_low()608 Vec4f get_low() const {
609 return y0;
610 }
get_high()611 Vec4f get_high() const {
612 return y1;
613 }
size()614 static int size () {
615 return 8;
616 }
617 };
618
619
620 /*****************************************************************************
621 *
622 * Operators for Vec8f
623 *
624 *****************************************************************************/
625
626 // vector operator + : add element by element
627 static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
628 return Vec8f(a.get_low() + b.get_low(), a.get_high() + b.get_high());
629 }
630
631 // vector operator + : add vector and scalar
632 static inline Vec8f operator + (Vec8f const & a, float b) {
633 return a + Vec8f(b);
634 }
635 static inline Vec8f operator + (float a, Vec8f const & b) {
636 return Vec8f(a) + b;
637 }
638
639 // vector operator += : add
640 static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
641 a = a + b;
642 return a;
643 }
644
645 // postfix operator ++
646 static inline Vec8f operator ++ (Vec8f & a, int) {
647 Vec8f a0 = a;
648 a = a + 1.0f;
649 return a0;
650 }
651
652 // prefix operator ++
653 static inline Vec8f & operator ++ (Vec8f & a) {
654 a = a + 1.0f;
655 return a;
656 }
657
658 // vector operator - : subtract element by element
659 static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
660 return Vec8f(a.get_low() - b.get_low(), a.get_high() - b.get_high());
661 }
662
663 // vector operator - : subtract vector and scalar
664 static inline Vec8f operator - (Vec8f const & a, float b) {
665 return a - Vec8f(b);
666 }
667 static inline Vec8f operator - (float a, Vec8f const & b) {
668 return Vec8f(a) - b;
669 }
670
671 // vector operator - : unary minus
672 // Change sign bit, even for 0, INF and NAN
673 static inline Vec8f operator - (Vec8f const & a) {
674 return Vec8f(-a.get_low(), -a.get_high());
675 }
676
677 // vector operator -= : subtract
678 static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
679 a = a - b;
680 return a;
681 }
682
683 // postfix operator --
684 static inline Vec8f operator -- (Vec8f & a, int) {
685 Vec8f a0 = a;
686 a = a - 1.0f;
687 return a0;
688 }
689
690 // prefix operator --
691 static inline Vec8f & operator -- (Vec8f & a) {
692 a = a - 1.0f;
693 return a;
694 }
695
696 // vector operator * : multiply element by element
697 static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
698 return Vec8f(a.get_low() * b.get_low(), a.get_high() * b.get_high());
699 }
700
701 // vector operator * : multiply vector and scalar
702 static inline Vec8f operator * (Vec8f const & a, float b) {
703 return a * Vec8f(b);
704 }
705 static inline Vec8f operator * (float a, Vec8f const & b) {
706 return Vec8f(a) * b;
707 }
708
709 // vector operator *= : multiply
710 static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
711 a = a * b;
712 return a;
713 }
714
715 // vector operator / : divide all elements by same integer
716 static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
717 return Vec8f(a.get_low() / b.get_low(), a.get_high() / b.get_high());
718 }
719
720 // vector operator / : divide vector and scalar
721 static inline Vec8f operator / (Vec8f const & a, float b) {
722 return a / Vec8f(b);
723 }
724 static inline Vec8f operator / (float a, Vec8f const & b) {
725 return Vec8f(a) / b;
726 }
727
728 // vector operator /= : divide
729 static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
730 a = a / b;
731 return a;
732 }
733
734 // vector operator == : returns true for elements for which a == b
735 static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
736 return Vec8fb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
737 }
738
739 // vector operator != : returns true for elements for which a != b
740 static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
741 return Vec8fb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
742 }
743
744 // vector operator < : returns true for elements for which a < b
745 static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
746 return Vec8fb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
747 }
748
749 // vector operator <= : returns true for elements for which a <= b
750 static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
751 return Vec8fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
752 }
753
754 // vector operator > : returns true for elements for which a > b
755 static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
756 return Vec8fb(a.get_low() > b.get_low(), a.get_high() > b.get_high());
757 }
758
759 // vector operator >= : returns true for elements for which a >= b
760 static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
761 return Vec8fb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
762 }
763
764 // Bitwise logical operators
765
766 // vector operator & : bitwise and
767 static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
768 return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
769 }
770
771 // vector operator &= : bitwise and
772 static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
773 a = a & b;
774 return a;
775 }
776
777 // vector operator & : bitwise and of Vec8f and Vec8fb
778 static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
779 return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
780 }
781 static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
782 return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
783 }
784
785 // vector operator | : bitwise or
786 static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
787 return Vec8f(a.get_low() | b.get_low(), a.get_high() | b.get_high());
788 }
789
790 // vector operator |= : bitwise or
791 static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
792 a = a | b;
793 return a;
794 }
795
796 // vector operator ^ : bitwise xor
797 static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
798 return Vec8f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
799 }
800
801 // vector operator ^= : bitwise xor
802 static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
803 a = a ^ b;
804 return a;
805 }
806
807 // vector operator ! : logical not. Returns Boolean vector
808 static inline Vec8fb operator ! (Vec8f const & a) {
809 return Vec8fb(!a.get_low(), !a.get_high());
810 }
811
812
813 /*****************************************************************************
814 *
815 * Functions for Vec8f
816 *
817 *****************************************************************************/
818
819 // Select between two operands. Corresponds to this pseudocode:
820 // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
821 // Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
select(Vec8fb const & s,Vec8f const & a,Vec8f const & b)822 static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) {
823 return Vec8f(select(s.get_low(),a.get_low(),b.get_low()), select(s.get_high(),a.get_high(),b.get_high()));
824 }
825
826 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8fb const & f,Vec8f const & a,Vec8f const & b)827 static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
828 return a + (Vec8f(f) & b);
829 }
830
831 // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
if_mul(Vec8fb const & f,Vec8f const & a,Vec8f const & b)832 static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
833 return a * select(f, b, 1.f);
834 }
835
836
837 // General arithmetic functions, etc.
838
839 // Horizontal add: Calculates the sum of all vector elements.
horizontal_add(Vec8f const & a)840 static inline float horizontal_add (Vec8f const & a) {
841 return horizontal_add(a.get_low() + a.get_high());
842 }
843
844 // function max: a > b ? a : b
max(Vec8f const & a,Vec8f const & b)845 static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
846 return Vec8f(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
847 }
848
849 // function min: a < b ? a : b
min(Vec8f const & a,Vec8f const & b)850 static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
851 return Vec8f(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
852 }
853
854 // function abs: absolute value
855 // Removes sign bit, even for -0.0f, -INF and -NAN
abs(Vec8f const & a)856 static inline Vec8f abs(Vec8f const & a) {
857 return Vec8f(abs(a.get_low()), abs(a.get_high()));
858 }
859
860 // function sqrt: square root
sqrt(Vec8f const & a)861 static inline Vec8f sqrt(Vec8f const & a) {
862 return Vec8f(sqrt(a.get_low()), sqrt(a.get_high()));
863 }
864
865 // function square: a * a
square(Vec8f const & a)866 static inline Vec8f square(Vec8f const & a) {
867 return Vec8f(square(a.get_low()), square(a.get_high()));
868 }
869
870 // pow(Vec8f, int):
871 template <typename TT> static Vec8f pow(Vec8f const & a, TT const & n);
872
873 // Raise floating point numbers to integer power n
874 template <>
875 inline Vec8f pow<int>(Vec8f const & x0, int const & n) {
876 return pow_template_i<Vec8f>(x0, n);
877 }
878
879 // allow conversion from unsigned int
880 template <>
881 inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t const & n) {
882 return pow_template_i<Vec8f>(x0, (int)n);
883 }
884
885
886 // Raise floating point numbers to integer power n, where n is a compile-time constant
887 template <int n>
pow_n(Vec8f const & a)888 static inline Vec8f pow_n(Vec8f const & a) {
889 return Vec8f(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
890 }
891
892 template <int n>
pow(Vec8f const & a,Const_int_t<n>)893 static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
894 return pow_n<n>(a);
895 }
896
897
898 // function round: round to nearest integer (even). (result as float vector)
round(Vec8f const & a)899 static inline Vec8f round(Vec8f const & a) {
900 return Vec8f(round(a.get_low()), round(a.get_high()));
901 }
902
903 // function truncate: round towards zero. (result as float vector)
truncate(Vec8f const & a)904 static inline Vec8f truncate(Vec8f const & a) {
905 return Vec8f(truncate(a.get_low()), truncate(a.get_high()));
906 }
907
908 // function floor: round towards minus infinity. (result as float vector)
floor(Vec8f const & a)909 static inline Vec8f floor(Vec8f const & a) {
910 return Vec8f(floor(a.get_low()), floor(a.get_high()));
911 }
912
913 // function ceil: round towards plus infinity. (result as float vector)
ceil(Vec8f const & a)914 static inline Vec8f ceil(Vec8f const & a) {
915 return Vec8f(ceil(a.get_low()), ceil(a.get_high()));
916 }
917
918 #ifdef VECTORI256_H // 256 bit integer vectors are available
919 // function round_to_int: round to nearest integer (even). (result as integer vector)
round_to_int(Vec8f const & a)920 static inline Vec8i round_to_int(Vec8f const & a) {
921 // Note: assume MXCSR control register is set to rounding
922 return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high()));
923 }
924
925 // function truncate_to_int: round towards zero. (result as integer vector)
truncate_to_int(Vec8f const & a)926 static inline Vec8i truncate_to_int(Vec8f const & a) {
927 return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high()));
928 }
929
930 // function to_float: convert integer vector to float vector
to_float(Vec8i const & a)931 static inline Vec8f to_float(Vec8i const & a) {
932 return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
933 }
934
935 // function to_float: convert unsigned integer vector to float vector
to_float(Vec8ui const & a)936 static inline Vec8f to_float(Vec8ui const & a) {
937 return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
938 }
939
940 #endif // VECTORI256_H
941
942
943 // Approximate math functions
944
945 // approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
approx_recipr(Vec8f const & a)946 static inline Vec8f approx_recipr(Vec8f const & a) {
947 return Vec8f(approx_recipr(a.get_low()), approx_recipr(a.get_high()));
948 }
949
950 // approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
approx_rsqrt(Vec8f const & a)951 static inline Vec8f approx_rsqrt(Vec8f const & a) {
952 return Vec8f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high()));
953 }
954
955 // Fused multiply and add functions
956
957 // Multiply and add
mul_add(Vec8f const & a,Vec8f const & b,Vec8f const & c)958 static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
959 return Vec8f(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high()));
960 }
961
962 // Multiply and subtract
mul_sub(Vec8f const & a,Vec8f const & b,Vec8f const & c)963 static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
964 return Vec8f(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high()));
965 }
966
967 // Multiply and inverse subtract
nmul_add(Vec8f const & a,Vec8f const & b,Vec8f const & c)968 static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
969 return Vec8f(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high()));
970 }
971
972
973 // Multiply and subtract with extra precision on the intermediate calculations,
974 // even if FMA instructions not supported, using Veltkamp-Dekker split
mul_sub_x(Vec8f const & a,Vec8f const & b,Vec8f const & c)975 static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
976 return Vec8f(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high()));
977 }
978
979
980 // Math functions using fast bit manipulation
981
982 #ifdef VECTORI256_H // 256 bit integer vectors are available
983 // Extract the exponent as an integer
984 // exponent(a) = floor(log2(abs(a)));
985 // exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
exponent(Vec8f const & a)986 static inline Vec8i exponent(Vec8f const & a) {
987 return Vec8i(exponent(a.get_low()), exponent(a.get_high()));
988 }
989 #endif
990
991 // Extract the fraction part of a floating point number
992 // a = 2^exponent(a) * fraction(a), except for a = 0
993 // fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
fraction(Vec8f const & a)994 static inline Vec8f fraction(Vec8f const & a) {
995 return Vec8f(fraction(a.get_low()), fraction(a.get_high()));
996 }
997
998 #ifdef VECTORI256_H // 256 bit integer vectors are available
999 // Fast calculation of pow(2,n) with n integer
1000 // n = 0 gives 1.0f
1001 // n >= 128 gives +INF
1002 // n <= -127 gives 0.0f
1003 // This function will never produce denormals, and never raise exceptions
exp2(Vec8i const & a)1004 static inline Vec8f exp2(Vec8i const & a) {
1005 return Vec8f(exp2(a.get_low()), exp2(a.get_high()));
1006 }
1007 //static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
1008 #endif // VECTORI256_H
1009
1010
1011
1012 // Categorization functions
1013
1014 // Function sign_bit: gives true for elements that have the sign bit set
1015 // even for -0.0f, -INF and -NAN
1016 // Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
1017 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
sign_bit(Vec8f const & a)1018 static inline Vec8fb sign_bit(Vec8f const & a) {
1019 return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high()));
1020 }
1021
1022 // Function sign_combine: changes the sign of a when b has the sign bit set
1023 // same as select(sign_bit(b), -a, a)
sign_combine(Vec8f const & a,Vec8f const & b)1024 static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) {
1025 return Vec8f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
1026 }
1027
1028 // Function is_finite: gives true for elements that are normal, denormal or zero,
1029 // false for INF and NAN
1030 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_finite(Vec8f const & a)1031 static inline Vec8fb is_finite(Vec8f const & a) {
1032 return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high()));
1033 }
1034
1035 // Function is_inf: gives true for elements that are +INF or -INF
1036 // false for finite numbers and NAN
1037 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_inf(Vec8f const & a)1038 static inline Vec8fb is_inf(Vec8f const & a) {
1039 return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high()));
1040 }
1041
1042 // Function is_nan: gives true for elements that are +NAN or -NAN
1043 // false for finite numbers and +/-INF
1044 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_nan(Vec8f const & a)1045 static inline Vec8fb is_nan(Vec8f const & a) {
1046 return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high()));
1047 }
1048
1049 // Function is_subnormal: gives true for elements that are denormal (subnormal)
1050 // false for finite numbers, zero, NAN and INF
is_subnormal(Vec8f const & a)1051 static inline Vec8fb is_subnormal(Vec8f const & a) {
1052 return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
1053 }
1054
1055 // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
1056 // false for finite numbers, NAN and INF
is_zero_or_subnormal(Vec8f const & a)1057 static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) {
1058 return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
1059 }
1060
1061 // Function infinite4f: returns a vector where all elements are +INF
infinite8f()1062 static inline Vec8f infinite8f() {
1063 return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>();
1064 }
1065
1066 // Function nan4f: returns a vector where all elements are +NAN (quiet)
1067 static inline Vec8f nan8f(int n = 0x10) {
1068 return Vec8f(nan4f(n), nan4f(n));
1069 }
1070
1071 // change signs on vectors Vec8f
1072 // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
1073 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
change_sign(Vec8f const & a)1074 static inline Vec8f change_sign(Vec8f const & a) {
1075 if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;
1076 Vec4f lo = change_sign<i0,i1,i2,i3>(a.get_low());
1077 Vec4f hi = change_sign<i4,i5,i6,i7>(a.get_high());
1078 return Vec8f(lo, hi);
1079 }
1080
1081
1082 /*****************************************************************************
1083 *
1084 * Vec2d: Vector of 2 double precision floating point values
1085 *
1086 *****************************************************************************/
1087
1088 class Vec4d : public Vec256de {
1089 public:
1090 // Default constructor:
Vec4d()1091 Vec4d() {
1092 }
1093 // Constructor to broadcast the same value into all elements:
Vec4d(double d)1094 Vec4d(double d) {
1095 y1 = y0 = _mm_set1_pd(d);
1096 }
1097 // Constructor to build from all elements:
Vec4d(double d0,double d1,double d2,double d3)1098 Vec4d(double d0, double d1, double d2, double d3) {
1099 y0 = _mm_setr_pd(d0, d1);
1100 y1 = _mm_setr_pd(d2, d3);
1101 }
1102 // Constructor to build from two Vec4f:
Vec4d(Vec2d const & a0,Vec2d const & a1)1103 Vec4d(Vec2d const & a0, Vec2d const & a1) {
1104 y0 = a0; y1 = a1;
1105 }
1106 // Constructor to convert from type Vec256de
Vec4d(Vec256de const & x)1107 Vec4d(Vec256de const & x) {
1108 y0 = x.get_low();
1109 y1 = x.get_high();
1110 }
1111 // Assignment operator to convert from type Vec256de
1112 Vec4d & operator = (Vec256de const & x) {
1113 y0 = x.get_low();
1114 y1 = x.get_high();
1115 return *this;
1116 }
1117 // Member function to load from array (unaligned)
load(double const * p)1118 Vec4d & load(double const * p) {
1119 y0 = _mm_loadu_pd(p);
1120 y1 = _mm_loadu_pd(p+2);
1121 return *this;
1122 }
1123 // Member function to load from array, aligned by 32
1124 // You may use load_a instead of load if you are certain that p points to an address
1125 // divisible by 32
load_a(double const * p)1126 Vec4d & load_a(double const * p) {
1127 y0 = _mm_load_pd(p);
1128 y1 = _mm_load_pd(p+2);
1129 return *this;
1130 }
1131 // Member function to store into array (unaligned)
store(double * p)1132 void store(double * p) const {
1133 _mm_storeu_pd(p, y0);
1134 _mm_storeu_pd(p+2, y1);
1135 }
1136 // Member function to store into array, aligned by 32
1137 // You may use store_a instead of store if you are certain that p points to an address
1138 // divisible by 32
store_a(double * p)1139 void store_a(double * p) const {
1140 _mm_store_pd(p, y0);
1141 _mm_store_pd(p+2, y1);
1142 }
1143 // Partial load. Load n elements and set the rest to 0
load_partial(int n,double const * p)1144 Vec4d & load_partial(int n, double const * p) {
1145 if (n > 0 && n <= 2) {
1146 *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
1147 }
1148 else if (n > 2 && n <= 4) {
1149 *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
1150 }
1151 else {
1152 y1 = y0 = _mm_setzero_pd();
1153 }
1154 return *this;
1155 }
1156 // Partial store. Store n elements
store_partial(int n,double * p)1157 void store_partial(int n, double * p) const {
1158 if (n <= 2) {
1159 get_low().store_partial(n, p);
1160 }
1161 else if (n <= 4) {
1162 get_low().store(p);
1163 get_high().store_partial(n - 2, p + 2);
1164 }
1165 }
cutoff(int n)1166 Vec4d & cutoff(int n) {
1167 if (uint32_t(n) >= 4) return *this;
1168 else if (n >= 2) {
1169 y1 = Vec2d(y1).cutoff(n - 2);
1170 }
1171 else {
1172 y0 = Vec2d(y0).cutoff(n);
1173 y1 = Vec2d(0.0);
1174 }
1175 return *this;
1176 }
1177 // Member function to change a single element in vector
1178 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,double value)1179 Vec4d const & insert(uint32_t index, double value) {
1180 if (index < 2) {
1181 y0 = Vec2d(y0).insert(index, value);
1182 }
1183 else {
1184 y1 = Vec2d(y1).insert(index-2, value);
1185 }
1186 return *this;
1187 }
1188 // Member function extract a single element from vector
1189 // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)1190 double extract(uint32_t index) const {
1191 if (index < 2) {
1192 return Vec2d(y0).extract(index);
1193 }
1194 else {
1195 return Vec2d(y1).extract(index-2);
1196 }
1197 }
1198 // Extract a single element. Use store function if extracting more than one element.
1199 // Operator [] can only read an element, not write.
1200 double operator [] (uint32_t index) const {
1201 return extract(index);
1202 }
1203 // Member functions to split into two Vec2d:
get_low()1204 Vec2d get_low() const {
1205 return y0;
1206 }
get_high()1207 Vec2d get_high() const {
1208 return y1;
1209 }
size()1210 static int size () {
1211 return 4;
1212 }
1213 };
1214
1215
1216
1217 /*****************************************************************************
1218 *
1219 * Operators for Vec4d
1220 *
1221 *****************************************************************************/
1222
1223 // vector operator + : add element by element
1224 static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
1225 return Vec4d(a.get_low() + b.get_low(), a.get_high() + b.get_high());
1226 }
1227
1228 // vector operator + : add vector and scalar
1229 static inline Vec4d operator + (Vec4d const & a, double b) {
1230 return a + Vec4d(b);
1231 }
1232 static inline Vec4d operator + (double a, Vec4d const & b) {
1233 return Vec4d(a) + b;
1234 }
1235
1236 // vector operator += : add
1237 static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
1238 a = a + b;
1239 return a;
1240 }
1241
1242 // postfix operator ++
1243 static inline Vec4d operator ++ (Vec4d & a, int) {
1244 Vec4d a0 = a;
1245 a = a + 1.0;
1246 return a0;
1247 }
1248
1249 // prefix operator ++
1250 static inline Vec4d & operator ++ (Vec4d & a) {
1251 a = a + 1.0;
1252 return a;
1253 }
1254
1255 // vector operator - : subtract element by element
1256 static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
1257 return Vec4d(a.get_low() - b.get_low(), a.get_high() - b.get_high());
1258 }
1259
1260 // vector operator - : subtract vector and scalar
1261 static inline Vec4d operator - (Vec4d const & a, double b) {
1262 return a - Vec4d(b);
1263 }
1264 static inline Vec4d operator - (double a, Vec4d const & b) {
1265 return Vec4d(a) - b;
1266 }
1267
1268 // vector operator - : unary minus
1269 // Change sign bit, even for 0, INF and NAN
1270 static inline Vec4d operator - (Vec4d const & a) {
1271 return Vec4d(-a.get_low(), -a.get_high());
1272 }
1273
1274 // vector operator -= : subtract
1275 static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
1276 a = a - b;
1277 return a;
1278 }
1279
1280 // postfix operator --
1281 static inline Vec4d operator -- (Vec4d & a, int) {
1282 Vec4d a0 = a;
1283 a = a - 1.0;
1284 return a0;
1285 }
1286
1287 // prefix operator --
1288 static inline Vec4d & operator -- (Vec4d & a) {
1289 a = a - 1.0;
1290 return a;
1291 }
1292
1293 // vector operator * : multiply element by element
1294 static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
1295 return Vec4d(a.get_low() * b.get_low(), a.get_high() * b.get_high());
1296 }
1297
1298 // vector operator * : multiply vector and scalar
1299 static inline Vec4d operator * (Vec4d const & a, double b) {
1300 return a * Vec4d(b);
1301 }
1302 static inline Vec4d operator * (double a, Vec4d const & b) {
1303 return Vec4d(a) * b;
1304 }
1305
1306 // vector operator *= : multiply
1307 static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
1308 a = a * b;
1309 return a;
1310 }
1311
1312 // vector operator / : divide all elements by same integer
1313 static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
1314 return Vec4d(a.get_low() / b.get_low(), a.get_high() / b.get_high());
1315 }
1316
1317 // vector operator / : divide vector and scalar
1318 static inline Vec4d operator / (Vec4d const & a, double b) {
1319 return a / Vec4d(b);
1320 }
1321 static inline Vec4d operator / (double a, Vec4d const & b) {
1322 return Vec4d(a) / b;
1323 }
1324
1325 // vector operator /= : divide
1326 static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
1327 a = a / b;
1328 return a;
1329 }
1330
1331 // vector operator == : returns true for elements for which a == b
1332 static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) {
1333 return Vec4db(a.get_low() == b.get_low(), a.get_high() == b.get_high());
1334 }
1335
1336 // vector operator != : returns true for elements for which a != b
1337 static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) {
1338 return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high());
1339 }
1340
1341 // vector operator < : returns true for elements for which a < b
1342 static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) {
1343 return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1344 }
1345
1346 // vector operator <= : returns true for elements for which a <= b
1347 static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
1348 return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
1349 }
1350
1351 // vector operator > : returns true for elements for which a > b
1352 static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) {
1353 return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high());
1354 }
1355
1356 // vector operator >= : returns true for elements for which a >= b
1357 static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
1358 return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1359 }
1360
1361 // Bitwise logical operators
1362
1363 // vector operator & : bitwise and
1364 static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
1365 return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1366 }
1367
1368 // vector operator &= : bitwise and
1369 static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
1370 a = a & b;
1371 return a;
1372 }
1373
1374 // vector operator & : bitwise and of Vec4d and Vec4db
1375 static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
1376 return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1377 }
1378 static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
1379 return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1380 }
1381
1382 // vector operator | : bitwise or
1383 static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
1384 return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high());
1385 }
1386
1387 // vector operator |= : bitwise or
1388 static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
1389 a = a | b;
1390 return a;
1391 }
1392
1393 // vector operator ^ : bitwise xor
1394 static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
1395 return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
1396 }
1397
1398 // vector operator ^= : bitwise xor
1399 static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
1400 a = a ^ b;
1401 return a;
1402 }
1403
1404 // vector operator ! : logical not. Returns Boolean vector
1405 static inline Vec4db operator ! (Vec4d const & a) {
1406 return Vec4db(!a.get_low(), !a.get_high());
1407 }
1408
1409
1410 /*****************************************************************************
1411 *
1412 * Functions for Vec4d
1413 *
1414 *****************************************************************************/
1415
1416 // Select between two operands. Corresponds to this pseudocode:
1417 // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
1418 // Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
1419 // No other values are allowed.
select(Vec4db const & s,Vec4d const & a,Vec4d const & b)1420 static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) {
1421 return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
1422 }
1423
1424 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec4db const & f,Vec4d const & a,Vec4d const & b)1425 static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
1426 return a + (Vec4d(f) & b);
1427 }
1428
1429 // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
if_mul(Vec4db const & f,Vec4d const & a,Vec4d const & b)1430 static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
1431 return a * select(f, b, 1.f);
1432 }
1433
1434 // General arithmetic functions, etc.
1435
1436 // Horizontal add: Calculates the sum of all vector elements.
horizontal_add(Vec4d const & a)1437 static inline double horizontal_add (Vec4d const & a) {
1438 return horizontal_add(a.get_low() + a.get_high());
1439 }
1440
1441 // function max: a > b ? a : b
max(Vec4d const & a,Vec4d const & b)1442 static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
1443 return Vec4d(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
1444 }
1445
1446 // function min: a < b ? a : b
min(Vec4d const & a,Vec4d const & b)1447 static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
1448 return Vec4d(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
1449 }
1450
1451 // function abs: absolute value
1452 // Removes sign bit, even for -0.0f, -INF and -NAN
abs(Vec4d const & a)1453 static inline Vec4d abs(Vec4d const & a) {
1454 return Vec4d(abs(a.get_low()), abs(a.get_high()));
1455 }
1456
1457 // function sqrt: square root
sqrt(Vec4d const & a)1458 static inline Vec4d sqrt(Vec4d const & a) {
1459 return Vec4d(sqrt(a.get_low()), sqrt(a.get_high()));
1460 }
1461
1462 // function square: a * a
square(Vec4d const & a)1463 static inline Vec4d square(Vec4d const & a) {
1464 return Vec4d(square(a.get_low()), square(a.get_high()));
1465 }
1466
1467 // pow(Vec4d, int):
1468 // Raise floating point numbers to integer power n
1469 template <typename TT> static Vec4d pow(Vec4d const & a, TT const & n);
1470
1471 // Raise floating point numbers to integer power n
1472 template <>
1473 inline Vec4d pow<int>(Vec4d const & x0, int const & n) {
1474 return pow_template_i<Vec4d>(x0, n);
1475 }
1476
1477 // allow conversion from unsigned int
1478 template <>
1479 inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t const & n) {
1480 return pow_template_i<Vec4d>(x0, (int)n);
1481 }
1482
1483
1484 // Raise floating point numbers to integer power n, where n is a compile-time constant
1485 template <int n>
pow_n(Vec4d const & a)1486 static inline Vec4d pow_n(Vec4d const & a) {
1487 return Vec4d(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
1488 }
1489
1490 template <int n>
pow(Vec4d const & a,Const_int_t<n>)1491 static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
1492 return pow_n<n>(a);
1493 }
1494
1495
1496 // function round: round to nearest integer (even). (result as double vector)
round(Vec4d const & a)1497 static inline Vec4d round(Vec4d const & a) {
1498 return Vec4d(round(a.get_low()), round(a.get_high()));
1499 }
1500
1501 // function truncate: round towards zero. (result as double vector)
truncate(Vec4d const & a)1502 static inline Vec4d truncate(Vec4d const & a) {
1503 return Vec4d(truncate(a.get_low()), truncate(a.get_high()));
1504 }
1505
1506 // function floor: round towards minus infinity. (result as double vector)
floor(Vec4d const & a)1507 static inline Vec4d floor(Vec4d const & a) {
1508 return Vec4d(floor(a.get_low()), floor(a.get_high()));
1509 }
1510
1511 // function ceil: round towards plus infinity. (result as double vector)
ceil(Vec4d const & a)1512 static inline Vec4d ceil(Vec4d const & a) {
1513 return Vec4d(ceil(a.get_low()), ceil(a.get_high()));
1514 }
1515
1516 // function round_to_int: round to nearest integer (even). (result as integer vector)
round_to_int(Vec4d const & a)1517 static inline Vec4i round_to_int(Vec4d const & a) {
1518 // Note: assume MXCSR control register is set to rounding
1519 return round_to_int(a.get_low(), a.get_high());
1520 }
1521
1522 // function truncate_to_int: round towards zero. (result as integer vector)
truncate_to_int(Vec4d const & a)1523 static inline Vec4i truncate_to_int(Vec4d const & a) {
1524 return truncate_to_int(a.get_low(), a.get_high());
1525 }
1526
1527 #ifdef VECTORI256_H // 256 bit integer vectors are available
1528
1529 // function truncate_to_int64: round towards zero. (inefficient)
truncate_to_int64(Vec4d const & a)1530 static inline Vec4q truncate_to_int64(Vec4d const & a) {
1531 double aa[4];
1532 a.store(aa);
1533 return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
1534 }
1535
1536 // function truncate_to_int64_limited: round towards zero.
1537 // result as 64-bit integer vector, but with limited range
truncate_to_int64_limited(Vec4d const & a)1538 static inline Vec4q truncate_to_int64_limited(Vec4d const & a) {
1539 return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
1540 }
1541
1542 // function round_to_int64: round to nearest or even. (inefficient)
round_to_int64(Vec4d const & a)1543 static inline Vec4q round_to_int64(Vec4d const & a) {
1544 return truncate_to_int64(round(a));
1545 }
1546
1547 // function round_to_int64_limited: round to nearest integer
1548 // result as 64-bit integer vector, but with limited range
round_to_int64_limited(Vec4d const & a)1549 static inline Vec4q round_to_int64_limited(Vec4d const & a) {
1550 return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
1551 }
1552
1553 // function to_double: convert integer vector elements to double vector (inefficient)
to_double(Vec4q const & a)1554 static inline Vec4d to_double(Vec4q const & a) {
1555 int64_t aa[4];
1556 a.store(aa);
1557 return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
1558 }
1559
1560 // function to_double_limited: convert integer vector elements to double vector
1561 // limited to abs(x) < 2^31
to_double_limited(Vec4q const & x)1562 static inline Vec4d to_double_limited(Vec4q const & x) {
1563 return Vec4d(to_double_limited(x.get_low()),to_double_limited(x.get_high()));
1564 }
1565
1566 #endif // VECTORI256_H
1567
1568 // function to_double: convert integer vector to double vector
to_double(Vec4i const & a)1569 static inline Vec4d to_double(Vec4i const & a) {
1570 return Vec4d(to_double_low(a), to_double_high(a));
1571 }
1572
1573 // function compress: convert two Vec4d to one Vec8f
compress(Vec4d const & low,Vec4d const & high)1574 static inline Vec8f compress (Vec4d const & low, Vec4d const & high) {
1575 return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
1576 }
1577
1578 // Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
extend_low(Vec8f const & a)1579 static inline Vec4d extend_low (Vec8f const & a) {
1580 return Vec4d(extend_low(a.get_low()), extend_high(a.get_low()));
1581 }
1582
1583 // Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
extend_high(Vec8f const & a)1584 static inline Vec4d extend_high (Vec8f const & a) {
1585 return Vec4d(extend_low(a.get_high()), extend_high(a.get_high()));
1586 }
1587
1588
1589 // Fused multiply and add functions
1590
1591 // Multiply and add
mul_add(Vec4d const & a,Vec4d const & b,Vec4d const & c)1592 static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1593 return Vec4d(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high()));
1594 }
1595
1596 // Multiply and subtract
mul_sub(Vec4d const & a,Vec4d const & b,Vec4d const & c)1597 static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1598 return Vec4d(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high()));
1599 }
1600
1601 // Multiply and inverse subtract
nmul_add(Vec4d const & a,Vec4d const & b,Vec4d const & c)1602 static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1603 return Vec4d(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high()));
1604 }
1605
1606 // Multiply and subtract with extra precision on the intermediate calculations,
1607 // even if FMA instructions not supported, using Veltkamp-Dekker split
mul_sub_x(Vec4d const & a,Vec4d const & b,Vec4d const & c)1608 static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1609 return Vec4d(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high()));
1610 }
1611
1612
1613 // Math functions using fast bit manipulation
1614
1615 #ifdef VECTORI256_H // 256 bit integer vectors are available, AVX2
1616 // Extract the exponent as an integer
1617 // exponent(a) = floor(log2(abs(a)));
1618 // exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
exponent(Vec4d const & a)1619 static inline Vec4q exponent(Vec4d const & a) {
1620 return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
1621 }
1622
1623 // Extract the fraction part of a floating point number
1624 // a = 2^exponent(a) * fraction(a), except for a = 0
1625 // fraction(1.0) = 1.0, fraction(5.0) = 1.25
fraction(Vec4d const & a)1626 static inline Vec4d fraction(Vec4d const & a) {
1627 return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
1628 }
1629
1630 // Fast calculation of pow(2,n) with n integer
1631 // n = 0 gives 1.0
1632 // n >= 1024 gives +INF
1633 // n <= -1023 gives 0.0
1634 // This function will never produce denormals, and never raise exceptions
exp2(Vec4q const & a)1635 static inline Vec4d exp2(Vec4q const & a) {
1636 return Vec4d(exp2(a.get_low()), exp2(a.get_high()));
1637 }
1638 //static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
1639 #endif
1640
1641
1642 // Categorization functions
1643
1644 // Function sign_bit: gives true for elements that have the sign bit set
1645 // even for -0.0, -INF and -NAN
1646 // Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
sign_bit(Vec4d const & a)1647 static inline Vec4db sign_bit(Vec4d const & a) {
1648 return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high()));
1649 }
1650
1651 // Function sign_combine: changes the sign of a when b has the sign bit set
1652 // same as select(sign_bit(b), -a, a)
sign_combine(Vec4d const & a,Vec4d const & b)1653 static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) {
1654 return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
1655 }
1656
1657 // Function is_finite: gives true for elements that are normal, denormal or zero,
1658 // false for INF and NAN
is_finite(Vec4d const & a)1659 static inline Vec4db is_finite(Vec4d const & a) {
1660 return Vec4db(is_finite(a.get_low()), is_finite(a.get_high()));
1661 }
1662
1663 // Function is_inf: gives true for elements that are +INF or -INF
1664 // false for finite numbers and NAN
is_inf(Vec4d const & a)1665 static inline Vec4db is_inf(Vec4d const & a) {
1666 return Vec4db(is_inf(a.get_low()), is_inf(a.get_high()));
1667 }
1668
1669 // Function is_nan: gives true for elements that are +NAN or -NAN
1670 // false for finite numbers and +/-INF
is_nan(Vec4d const & a)1671 static inline Vec4db is_nan(Vec4d const & a) {
1672 return Vec4db(is_nan(a.get_low()), is_nan(a.get_high()));
1673 }
1674
1675 // Function is_subnormal: gives true for elements that are denormal (subnormal)
1676 // false for finite numbers, zero, NAN and INF
is_subnormal(Vec4d const & a)1677 static inline Vec4db is_subnormal(Vec4d const & a) {
1678 return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
1679 }
1680
1681 // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
1682 // false for finite numbers, NAN and INF
is_zero_or_subnormal(Vec4d const & a)1683 static inline Vec4db is_zero_or_subnormal(Vec4d const & a) {
1684 return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
1685 }
1686
1687 // Function infinite2d: returns a vector where all elements are +INF
infinite4d()1688 static inline Vec4d infinite4d() {
1689 return Vec4d(infinite2d(), infinite2d());
1690 }
1691
1692 // Function nan2d: returns a vector where all elements are +NAN (quiet)
1693 static inline Vec4d nan4d(int n = 0x10) {
1694 return Vec4d(nan2d(n), nan2d(n));
1695 }
1696
1697 // change signs on vectors Vec4d
1698 // Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
1699 template <int i0, int i1, int i2, int i3>
change_sign(Vec4d const & a)1700 static inline Vec4d change_sign(Vec4d const & a) {
1701 if ((i0 | i1 | i2 | i3) == 0) return a;
1702 Vec2d lo = change_sign<i0,i1>(a.get_low());
1703 Vec2d hi = change_sign<i2,i3>(a.get_high());
1704 return Vec4d(lo, hi);
1705 }
1706
1707
1708 /*****************************************************************************
1709 *
1710 * Functions for reinterpretation between vector types
1711 *
1712 *****************************************************************************/
1713
reinterpret_i(Vec256ie const & x)1714 static inline Vec256ie reinterpret_i (Vec256ie const & x) {
1715 return x;
1716 }
1717
reinterpret_i(Vec256fe const & x)1718 static inline Vec256ie reinterpret_i (Vec256fe const & x) {
1719 return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
1720 }
1721
reinterpret_i(Vec256de const & x)1722 static inline Vec256ie reinterpret_i (Vec256de const & x) {
1723 return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
1724 }
1725
reinterpret_f(Vec256ie const & x)1726 static inline Vec256fe reinterpret_f (Vec256ie const & x) {
1727 return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
1728 }
1729
reinterpret_f(Vec256fe const & x)1730 static inline Vec256fe reinterpret_f (Vec256fe const & x) {
1731 return x;
1732 }
1733
reinterpret_f(Vec256de const & x)1734 static inline Vec256fe reinterpret_f (Vec256de const & x) {
1735 return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
1736 }
1737
reinterpret_d(Vec256ie const & x)1738 static inline Vec256de reinterpret_d (Vec256ie const & x) {
1739 return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
1740 }
1741
reinterpret_d(Vec256fe const & x)1742 static inline Vec256de reinterpret_d (Vec256fe const & x) {
1743 return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
1744 }
1745
reinterpret_d(Vec256de const & x)1746 static inline Vec256de reinterpret_d (Vec256de const & x) {
1747 return x;
1748 }
1749
1750
1751 /*****************************************************************************
1752 *
1753 * Vector permute and blend functions
1754 *
1755 ******************************************************************************
1756 *
1757 * The permute function can reorder the elements of a vector and optionally
1758 * set some elements to zero.
1759 *
1760 * The indexes are inserted as template parameters in <>. These indexes must be
1761 * constants. Each template parameter is an index to the element you want to
1762 * select. An index of -1 will generate zero. An index of -256 means don't care.
1763 *
1764 * Example:
1765 * Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13)
1766 * Vec4d b;
1767 * b = permute4d<1,0,-1,3>(a); // b is (11, 10, 0, 13)
1768 *
1769 *
1770 * The blend function can mix elements from two different vectors and
1771 * optionally set some elements to zero.
1772 *
1773 * The indexes are inserted as template parameters in <>. These indexes must be
1774 * constants. Each template parameter is an index to the element you want to
1775 * select, where indexes 0 - 3 indicate an element from the first source
1776 * vector and indexes 4 - 7 indicate an element from the second source vector.
1777 * A negative index will generate zero.
1778 *
1779 *
1780 * Example:
1781 * Vec4d a(10., 11., 12., 13.); // a is (10, 11, 12, 13)
1782 * Vec4d b(20., 21., 22., 23.); // a is (20, 21, 22, 23)
1783 * Vec4d c;
1784 * c = blend4d<4,3,7,-1> (a,b); // c is (20, 13, 23, 0)
1785 *****************************************************************************/
1786
1787 // permute vector Vec4d
1788 template <int i0, int i1, int i2, int i3>
permute4d(Vec4d const & a)1789 static inline Vec4d permute4d(Vec4d const & a) {
1790 return Vec4d(blend2d<i0,i1> (a.get_low(), a.get_high()),
1791 blend2d<i2,i3> (a.get_low(), a.get_high()));
1792 }
1793
1794 // helper function used below
1795 template <int n>
select4(Vec4d const & a,Vec4d const & b)1796 static inline Vec2d select4(Vec4d const & a, Vec4d const & b) {
1797 switch (n) {
1798 case 0:
1799 return a.get_low();
1800 case 1:
1801 return a.get_high();
1802 case 2:
1803 return b.get_low();
1804 case 3:
1805 return b.get_high();
1806 }
1807 return _mm_setzero_pd();
1808 }
1809
1810 // blend vectors Vec4d
1811 template <int i0, int i1, int i2, int i3>
blend4d(Vec4d const & a,Vec4d const & b)1812 static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
1813 const int j0 = i0 >= 0 ? i0/2 : i0;
1814 const int j1 = i1 >= 0 ? i1/2 : i1;
1815 const int j2 = i2 >= 0 ? i2/2 : i2;
1816 const int j3 = i3 >= 0 ? i3/2 : i3;
1817 Vec2d x0, x1;
1818
1819 if (j0 == j1 || i0 < 0 || i1 < 0) { // both from same
1820 const int k0 = j0 >= 0 ? j0 : j1;
1821 x0 = permute2d<i0 & -7, i1 & -7> (select4<k0> (a,b));
1822 }
1823 else {
1824 x0 = blend2d<i0 & -7, (i1 & -7) | 2> (select4<j0>(a,b), select4<j1>(a,b));
1825 }
1826 if (j2 == j3 || i2 < 0 || i3 < 0) { // both from same
1827 const int k1 = j2 >= 0 ? j2 : j3;
1828 x1 = permute2d<i2 & -7, i3 & -7> (select4<k1> (a,b));
1829 }
1830 else {
1831 x1 = blend2d<i2 & -7, (i3 & -7) | 2> (select4<j2>(a,b), select4<j3>(a,b));
1832 }
1833 return Vec4d(x0,x1);
1834 }
1835
1836 /*****************************************************************************
1837 *
1838 * Vector Vec8f permute and blend functions
1839 *
1840 *****************************************************************************/
1841
1842 // permute vector Vec8f
1843 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8f(Vec8f const & a)1844 static inline Vec8f permute8f(Vec8f const & a) {
1845 return Vec8f(blend4f<i0,i1,i2,i3> (a.get_low(), a.get_high()),
1846 blend4f<i4,i5,i6,i7> (a.get_low(), a.get_high()));
1847 }
1848
1849 // helper function used below
1850 template <int n>
select4(Vec8f const & a,Vec8f const & b)1851 static inline Vec4f select4(Vec8f const & a, Vec8f const & b) {
1852 switch (n) {
1853 case 0:
1854 return a.get_low();
1855 case 1:
1856 return a.get_high();
1857 case 2:
1858 return b.get_low();
1859 case 3:
1860 return b.get_high();
1861 }
1862 return _mm_setzero_ps();
1863 }
1864
1865 // blend vectors Vec8f
1866 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8f(Vec8f const & a,Vec8f const & b)1867 static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {
1868 const int j0 = i0 >= 0 ? i0/4 : i0;
1869 const int j1 = i1 >= 0 ? i1/4 : i1;
1870 const int j2 = i2 >= 0 ? i2/4 : i2;
1871 const int j3 = i3 >= 0 ? i3/4 : i3;
1872 const int j4 = i4 >= 0 ? i4/4 : i4;
1873 const int j5 = i5 >= 0 ? i5/4 : i5;
1874 const int j6 = i6 >= 0 ? i6/4 : i6;
1875 const int j7 = i7 >= 0 ? i7/4 : i7;
1876 Vec4f x0, x1;
1877
1878 const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
1879 const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
1880 const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
1881 const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
1882
1883 // Combine all the indexes into a single bitfield, with 4 bits for each
1884 const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
1885
1886 // Mask to zero out negative indexes
1887 const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
1888
1889 if (r0 < 0) {
1890 x0 = _mm_setzero_ps();
1891 }
1892 else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) {
1893 // i0 - i3 all from same source
1894 x0 = permute4f<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
1895 }
1896 else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) {
1897 // i0 - i3 all from two sources
1898 const int k0 = i0 >= 0 ? i0 & 3 : i0;
1899 const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
1900 const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
1901 const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
1902 x0 = blend4f<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
1903 }
1904 else {
1905 // i0 - i3 from three or four different sources
1906 x0 = blend4f<0,1,6,7> (
1907 blend4f<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
1908 blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
1909 }
1910
1911 if (r1 < 0) {
1912 x1 = _mm_setzero_ps();
1913 }
1914 else if (((m1 ^ r1*0x44440000u) & 0xCCCC0000 & mz) == 0) {
1915 // i4 - i7 all from same source
1916 x1 = permute4f<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
1917 }
1918 else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) {
1919 // i4 - i7 all from two sources
1920 const int k4 = i4 >= 0 ? i4 & 3 : i4;
1921 const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
1922 const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
1923 const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
1924 x1 = blend4f<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
1925 }
1926 else {
1927 // i4 - i7 from three or four different sources
1928 x1 = blend4f<0,1,6,7> (
1929 blend4f<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
1930 blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
1931 }
1932
1933 return Vec8f(x0,x1);
1934 }
1935
1936 /*****************************************************************************
1937 *
1938 * Vector lookup functions
1939 *
1940 ******************************************************************************
1941 *
1942 * These functions use vector elements as indexes into a table.
1943 * The table is given as one or more vectors or as an array.
1944 *
1945 * This can be used for several purposes:
1946 * - table lookup
1947 * - permute or blend with variable indexes
1948 * - blend from more than two sources
1949 * - gather non-contiguous data
1950 *
1951 * An index out of range may produce any value - the actual value produced is
1952 * implementation dependent and may be different for different instruction
1953 * sets. An index out of range does not produce an error message or exception.
1954 *
1955 * Example:
1956 * Vec4i a(2,0,0,3); // index a is ( 2, 0, 0, 3)
1957 * Vec4f b(1.0f,1.1f,1.2f,1.3f); // table b is (1.0, 1.1, 1.2, 1.3)
1958 * Vec4f c;
1959 * c = lookup4 (a,b); // result c is (1.2, 1.0, 1.0, 1.3)
1960 *
1961 *****************************************************************************/
1962
1963 #ifdef VECTORI256_H // Vec8i and Vec4q must be defined
1964
lookup8(Vec8i const & index,Vec8f const & table)1965 static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) {
1966 Vec4f r0 = lookup8(index.get_low() , table.get_low(), table.get_high());
1967 Vec4f r1 = lookup8(index.get_high(), table.get_low(), table.get_high());
1968 return Vec8f(r0, r1);
1969 }
1970
1971 template <int n>
lookup(Vec8i const & index,float const * table)1972 static inline Vec8f lookup(Vec8i const & index, float const * table) {
1973 if (n <= 0) return 0;
1974 if (n <= 4) {
1975 Vec4f table1 = Vec4f().load(table);
1976 return Vec8f(
1977 lookup4 (index.get_low(), table1),
1978 lookup4 (index.get_high(), table1));
1979 }
1980 if (n <= 8) {
1981 return lookup8(index, Vec8f().load(table));
1982 }
1983 // Limit index
1984 Vec8ui index1;
1985 if ((n & (n-1)) == 0) {
1986 // n is a power of 2, make index modulo n
1987 index1 = Vec8ui(index) & (n-1);
1988 }
1989 else {
1990 // n is not a power of 2, limit to n-1
1991 index1 = min(Vec8ui(index), n-1);
1992 }
1993 return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]],
1994 table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]);
1995 }
1996
lookup4(Vec4q const & index,Vec4d const & table)1997 static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) {
1998 Vec2d r0 = lookup4(index.get_low() , table.get_low(), table.get_high());
1999 Vec2d r1 = lookup4(index.get_high(), table.get_low(), table.get_high());
2000 return Vec4d(r0, r1);
2001 }
2002
2003 template <int n>
lookup(Vec4q const & index,double const * table)2004 static inline Vec4d lookup(Vec4q const & index, double const * table) {
2005 if (n <= 0) return 0;
2006 if (n <= 2) {
2007 Vec2d table1 = Vec2d().load(table);
2008 return Vec4d(
2009 lookup2 (index.get_low(), table1),
2010 lookup2 (index.get_high(), table1));
2011 }
2012 // Limit index
2013 Vec8ui index1;
2014 if ((n & (n-1)) == 0) {
2015 // n is a power of 2, make index modulo n
2016 index1 = Vec8ui(index) & constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>();
2017 }
2018 else {
2019 // n is not a power of 2, limit to n-1
2020 index1 = min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>() );
2021 }
2022 Vec4q index2 = Vec4q(index1);
2023 return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]);
2024 }
2025 #endif // VECTORI256_H
2026
2027 /*****************************************************************************
2028 *
2029 * Vector scatter functions
2030 *
2031 ******************************************************************************
2032 *
2033 * These functions write the elements of a vector to arbitrary positions in an
2034 * array in memory. Each vector element is written to an array position
2035 * determined by an index. An element is not written if the corresponding
2036 * index is out of range.
2037 * The indexes can be specified as constant template parameters or as an
2038 * integer vector.
2039 *
2040 * The scatter functions are useful if the data are distributed in a sparce
2041 * manner into the array. If the array is dense then it is more efficient
2042 * to permute the data into the right positions and then write the whole
2043 * permuted vector into the array.
2044 *
2045 * Example:
2046 * Vec8d a(10,11,12,13,14,15,16,17);
2047 * double b[16] = {0};
2048 * scatter<0,2,14,10,1,-1,5,9>(a,b);
2049 * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
2050 *
2051 *****************************************************************************/
2052
2053 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
scatter(Vec8f const & data,float * array)2054 static inline void scatter(Vec8f const & data, float * array) {
2055 const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
2056 for (int i = 0; i < 8; i++) {
2057 if (index[i] >= 0) array[index[i]] = data[i];
2058 }
2059 }
2060
2061 template <int i0, int i1, int i2, int i3>
scatter(Vec4d const & data,double * array)2062 static inline void scatter(Vec4d const & data, double * array) {
2063 const int index[4] = {i0,i1,i2,i3};
2064 for (int i = 0; i < 4; i++) {
2065 if (index[i] >= 0) array[index[i]] = data[i];
2066 }
2067 }
2068
scatter(Vec8i const & index,uint32_t limit,Vec8f const & data,float * array)2069 static inline void scatter(Vec8i const & index, uint32_t limit, Vec8f const & data, float * array) {
2070 for (int i = 0; i < 8; i++) {
2071 if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
2072 }
2073 }
2074
scatter(Vec4q const & index,uint32_t limit,Vec4d const & data,double * array)2075 static inline void scatter(Vec4q const & index, uint32_t limit, Vec4d const & data, double * array) {
2076 for (int i = 0; i < 4; i++) {
2077 if (uint64_t(index[i]) < uint64_t(limit)) array[index[i]] = data[i];
2078 }
2079 }
2080
scatter(Vec4i const & index,uint32_t limit,Vec4d const & data,double * array)2081 static inline void scatter(Vec4i const & index, uint32_t limit, Vec4d const & data, double * array) {
2082 for (int i = 0; i < 4; i++) {
2083 if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
2084 }
2085 }
2086
2087 /*****************************************************************************
2088 *
2089 * Horizontal scan functions
2090 *
2091 *****************************************************************************/
2092
2093 // Get index to the first element that is true. Return -1 if all are false
2094
horizontal_find_first(Vec8fb const & x)2095 static inline int horizontal_find_first(Vec8fb const & x) {
2096 return horizontal_find_first(Vec8ib(x));
2097 }
2098
horizontal_find_first(Vec4db const & x)2099 static inline int horizontal_find_first(Vec4db const & x) {
2100 return horizontal_find_first(Vec4qb(x));
2101 }
2102
2103 // Count the number of elements that are true
horizontal_count(Vec8fb const & x)2104 static inline uint32_t horizontal_count(Vec8fb const & x) {
2105 return horizontal_count(Vec8ib(x));
2106 }
2107
horizontal_count(Vec4db const & x)2108 static inline uint32_t horizontal_count(Vec4db const & x) {
2109 return horizontal_count(Vec4qb(x));
2110 }
2111
2112 /*****************************************************************************
2113 *
2114 * Boolean <-> bitfield conversion functions
2115 *
2116 *****************************************************************************/
2117
2118 // to_bits: convert boolean vector to integer bitfield
to_bits(Vec8fb const & x)2119 static inline uint8_t to_bits(Vec8fb const & x) {
2120 return to_bits(Vec8ib(x));
2121 }
2122
2123 // to_Vec8fb: convert integer bitfield to boolean vector
to_Vec8fb(uint8_t x)2124 static inline Vec8fb to_Vec8fb(uint8_t x) {
2125 return Vec8fb(to_Vec8ib(x));
2126 }
2127
2128 // to_bits: convert boolean vector to integer bitfield
to_bits(Vec4db const & x)2129 static inline uint8_t to_bits(Vec4db const & x) {
2130 return to_bits(Vec4qb(x));
2131 }
2132
2133 // to_Vec4db: convert integer bitfield to boolean vector
to_Vec4db(uint8_t x)2134 static inline Vec4db to_Vec4db(uint8_t x) {
2135 return Vec4db(to_Vec4qb(x));
2136 }
2137
2138 #ifdef VCL_NAMESPACE
2139 }
2140 #endif
2141
2142 #endif // VECTORF256_H
2143