1 /****************************  vectorf256e.h   *******************************
2 * Author:        Agner Fog
3 * Date created:  2012-05-30
4 * Last modified: 2017-02-19
5 * Version:       1.27
6 * Project:       vector classes
7 * Description:
8 * Header file defining 256-bit floating point vector classes as interface
9 * to intrinsic functions. Emulated for processors without AVX instruction set.
10 *
11 * The following vector classes are defined here:
12 * Vec8f     Vector of 8 single precision floating point numbers
13 * Vec8fb    Vector of 8 Booleans for use with Vec8f
14 * Vec4d     Vector of 4 double precision floating point numbers
15 * Vec4db    Vector of 4 Booleans for use with Vec4d
16 *
17 * For detailed instructions, see VectorClass.pdf
18 *
19 * (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
20 *****************************************************************************/
21 
22 // check combination of header files
23 #ifdef VECTORF256_H
24 #if    VECTORF256_H != 1
25 #error Two different versions of vectorf256.h included
26 #endif
27 #else
28 #define VECTORF256_H  1
29 
30 #if defined (VECTORI256_H) &&  VECTORI256_H >= 2
31 #error wrong combination of header files. Use vectorf256.h instead of vectorf256e.h if you have AVX2
32 #endif
33 
34 
35 #include "vectorf128.h"  // Define 128-bit vectors
36 
37 #ifdef VCL_NAMESPACE
38 namespace VCL_NAMESPACE {
39 #endif
40 
41 /*****************************************************************************
42 *
43 *          base class Vec256fe and Vec256de
44 *
45 *****************************************************************************/
46 // base class to replace __m256 when AVX is not supported
47 class Vec256fe {
48 protected:
49     __m128 y0;                         // low half
50     __m128 y1;                         // high half
51 public:
Vec256fe(void)52     Vec256fe(void) {};                 // default constructor
Vec256fe(__m128 x0,__m128 x1)53     Vec256fe(__m128 x0, __m128 x1) {   // constructor to build from two __m128
54         y0 = x0;  y1 = x1;
55     }
get_low()56     __m128 get_low() const {           // get low half
57         return y0;
58     }
get_high()59     __m128 get_high() const {          // get high half
60         return y1;
61     }
62 };
63 
64 // base class to replace __m256d when AVX is not supported
65 class Vec256de {
66 public:
Vec256de()67     Vec256de() {};                     // default constructor
Vec256de(__m128d x0,__m128d x1)68     Vec256de(__m128d x0, __m128d x1) { // constructor to build from two __m128d
69         y0 = x0;  y1 = x1;
70     }
get_low()71     __m128d get_low() const {          // get low half
72         return y0;
73     }
get_high()74     __m128d get_high() const {         // get high half
75         return y1;
76     }
77 protected:
78     __m128d y0;                        // low half
79     __m128d y1;                        // high half
80 };
81 
82 
83 /*****************************************************************************
84 *
85 *          select functions
86 *
87 *****************************************************************************/
88 // Select between two Vec256fe sources, element by element. Used in various functions
89 // and operators. Corresponds to this pseudocode:
90 // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
91 // Each element in s must be either 0 (false) or 0xFFFFFFFF (true).
selectf(Vec256fe const & s,Vec256fe const & a,Vec256fe const & b)92 static inline Vec256fe selectf (Vec256fe const & s, Vec256fe const & a, Vec256fe const & b) {
93     return Vec256fe(selectf(b.get_low(), a.get_low(), s.get_low()), selectf(b.get_high(), a.get_high(), s.get_high()));
94 }
95 
96 // Same, with two Vec256de sources.
97 // and operators. Corresponds to this pseudocode:
98 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
99 // Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
100 // values are allowed.
selectd(Vec256de const & s,Vec256de const & a,Vec256de const & b)101 static inline Vec256de selectd (Vec256de const & s, Vec256de const & a, Vec256de const & b) {
102     return Vec256de(selectd(b.get_low(), a.get_low(), s.get_low()), selectd(b.get_high(), a.get_high(), s.get_high()));
103 }
104 
105 
106 
107 /*****************************************************************************
108 *
109 *          Generate compile-time constant vector
110 *
111 *****************************************************************************/
112 // Generate a constant vector of 8 integers stored in memory,
113 // load as __m256
114 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
constant8f()115 static inline Vec256fe constant8f() {
116     static const union {
117         int      i[8];
118         __m128   y[2];
119     } u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
120     return Vec256fe(u.y[0], u.y[1]);
121 }
122 
123 
124 /*****************************************************************************
125 *
126 *          Vec8fb: Vector of 8 Booleans for use with Vec8f
127 *
128 *****************************************************************************/
129 
130 class Vec8fb : public Vec256fe {
131 public:
132     // Default constructor:
Vec8fb()133     Vec8fb() {
134     }
135     // Constructor to build from all elements:
Vec8fb(bool b0,bool b1,bool b2,bool b3,bool b4,bool b5,bool b6,bool b7)136     Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
137         y0 = Vec4fb(b0, b1, b2, b3);
138         y1 = Vec4fb(b4, b5, b6, b7);
139     }
140     // Constructor to build from two Vec4fb:
Vec8fb(Vec4fb const & a0,Vec4fb const & a1)141     Vec8fb(Vec4fb const & a0, Vec4fb const & a1) {
142         y0 = a0;  y1 = a1;
143     }
144     // Constructor to convert from type Vec256fe
Vec8fb(Vec256fe const & x)145     Vec8fb(Vec256fe const & x) {
146         y0 = x.get_low();  y1 = x.get_high();
147     }
148     // Assignment operator to convert from type Vec256fe
149     Vec8fb & operator = (Vec256fe const & x) {
150         y0 = x.get_low();  y1 = x.get_high();
151         return *this;
152     }
153 #ifdef VECTORI256_H  // 256 bit integer vectors are available
154     // Constructor to convert from type Vec8ib used as Boolean for integer vectors
Vec8fb(Vec8ib const & x)155     Vec8fb(Vec8ib const & x) {
156         y0 = _mm_castsi128_ps(Vec8i(x).get_low());
157         y1 = _mm_castsi128_ps(Vec8i(x).get_high());
158     }
159     // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
160     Vec8fb & operator = (Vec8ib const & x) {
161         y0 = _mm_castsi128_ps(Vec8i(x).get_low());
162         y1 = _mm_castsi128_ps(Vec8i(x).get_high());
163         return *this;
164     }
165     // Constructor to broadcast the same value into all elements:
Vec8fb(bool b)166     Vec8fb(bool b) {
167         y1 = y0 = Vec4fb(b);
168     }
169     // Assignment operator to broadcast scalar value:
170     Vec8fb & operator = (bool b) {
171         y0 = y1 = Vec4fb(b);
172         return *this;
173     }
174 private: // Prevent constructing from int, etc.
175     Vec8fb(int b);
176     Vec8fb & operator = (int x);
177 public:
178     // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
Vec8ib()179     operator Vec8ib() const {
180         return Vec8i(_mm_castps_si128(y0), _mm_castps_si128(y1));
181     }
182 #endif // VECTORI256_H
183     // Member function to change a single element in vector
184     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)185     Vec8fb const & insert(uint32_t index, bool value) {
186         if (index < 4) {
187             y0 = Vec4fb(y0).insert(index, value);
188         }
189         else {
190             y1 = Vec4fb(y1).insert(index-4, value);
191         }
192         return *this;
193     }
194     // Member function extract a single element from vector
195     // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)196     bool extract(uint32_t index) const {
197         if (index < 4) {
198             return Vec4fb(y0).extract(index);
199         }
200         else {
201             return Vec4fb(y1).extract(index-4);
202         }
203     }
204     // Extract a single element. Operator [] can only read an element, not write.
205     bool operator [] (uint32_t index) const {
206         return extract(index);
207     }
208     // Member functions to split into two Vec4fb:
get_low()209     Vec4fb get_low() const {
210         return y0;
211     }
get_high()212     Vec4fb get_high() const {
213         return y1;
214     }
size()215     static int size () {
216         return 8;
217     }
218 };
219 
220 
221 /*****************************************************************************
222 *
223 *          Operators for Vec8fb
224 *
225 *****************************************************************************/
226 
227 // vector operator & : bitwise and
228 static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) {
229     return Vec8fb(a.get_low() & b.get_low(), a.get_high() & b.get_high());
230 }
231 
232 static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) {
233     return a & b;
234 }
235 
236 // vector operator &= : bitwise and
237 static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) {
238     a = a & b;
239     return a;
240 }
241 
242 // vector operator | : bitwise or
243 static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) {
244     return Vec8fb(a.get_low() | b.get_low(), a.get_high() | b.get_high());
245 }
246 static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) {
247     return a | b;
248 }
249 
250 // vector operator |= : bitwise or
251 static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) {
252     a = a | b;
253     return a;
254 }
255 
256 // vector operator ^ : bitwise xor
257 static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) {
258     return Vec8fb(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
259 }
260 
261 // vector operator ^= : bitwise xor
262 static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) {
263     a = a ^ b;
264     return a;
265 }
266 
267 // vector operator ~ : bitwise not
268 static inline Vec8fb operator ~ (Vec8fb const & a) {
269     return Vec8fb(~a.get_low(), ~a.get_high());
270 }
271 
272 // vector operator ! : logical not
273 // (operator ! is less efficient than operator ~. Use only where not
274 // all bits in an element are the same)
275 static inline Vec8fb operator ! (Vec8fb const & a) {
276     return Vec8fb(!a.get_low(), !a.get_high());
277 }
278 
279 // Functions for Vec8fb
280 
281 // andnot: a & ~ b
andnot(Vec8fb const & a,Vec8fb const & b)282 static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) {
283     return Vec8fb(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
284 }
285 
286 
287 
288 /*****************************************************************************
289 *
290 *          Horizontal Boolean functions
291 *
292 *****************************************************************************/
293 
294 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec8fb const & a)295 static inline bool horizontal_and (Vec8fb const & a) {
296     return horizontal_and(a.get_low() & a.get_high());
297 }
298 
299 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec8fb const & a)300 static inline bool horizontal_or (Vec8fb const & a) {
301     return horizontal_or(a.get_low() | a.get_high());
302 }
303 
304 
305 
306 /*****************************************************************************
307 *
308 *          Vec4db: Vector of 4 Booleans for use with Vec4d
309 *
310 *****************************************************************************/
311 
312 class Vec4db : public Vec256de {
313 public:
314     // Default constructor:
Vec4db()315     Vec4db() {
316     }
317     // Constructor to build from all elements:
Vec4db(bool b0,bool b1,bool b2,bool b3)318     Vec4db(bool b0, bool b1, bool b2, bool b3) {
319         y0 = Vec2db(b0, b1);
320         y1 = Vec2db(b2, b3);
321     }
322     // Constructor to build from two Vec2db:
Vec4db(Vec2db const & a0,Vec2db const & a1)323     Vec4db(Vec2db const & a0, Vec2db const & a1) {
324         y0 = a0;  y1 = a1;
325     }
326     // Constructor to convert from type Vec256de
Vec4db(Vec256de const & x)327     Vec4db(Vec256de const & x) {
328         y0 = x.get_low();  y1 = x.get_high();
329     }
330     // Assignment operator to convert from type Vec256de
331     Vec4db & operator = (Vec256de const & x) {
332         y0 = x.get_low();  y1 = x.get_high();
333         return *this;
334     }
335 #ifdef VECTORI256_H  // 256 bit integer vectors are available
336     // Constructor to convert from type Vec4qb used as Boolean for integer vectors
Vec4db(Vec4qb const & x)337     Vec4db(Vec4qb const & x) {
338         y0 = _mm_castsi128_pd(Vec4q(x).get_low());
339         y1 = _mm_castsi128_pd(Vec4q(x).get_high());
340     }
341     // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
342     Vec4db & operator = (Vec4qb const & x) {
343         y0 = _mm_castsi128_pd(Vec4q(x).get_low());
344         y1 = _mm_castsi128_pd(Vec4q(x).get_high());
345         return *this;
346     }
347     // Constructor to broadcast the same value into all elements:
Vec4db(bool b)348     Vec4db(bool b) {
349         y1 = y0 = Vec2db(b);
350     }
351     // Assignment operator to broadcast scalar value:
352     Vec4db & operator = (bool b) {
353         y0 = y1 = Vec2db(b);
354         return *this;
355     }
356 private: // Prevent constructing from int, etc.
357     Vec4db(int b);
358     Vec4db & operator = (int x);
359 public:
360     // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
Vec4qb()361     operator Vec4qb() const {
362         return Vec4q(_mm_castpd_si128(y0), _mm_castpd_si128(y1));
363     }
364 #endif // VECTORI256_H
365     // Member function to change a single element in vector
366     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)367     Vec4db const & insert(uint32_t index, bool value) {
368         if (index < 2) {
369             y0 = Vec2db(y0).insert(index, value);
370         }
371         else {
372             y1 = Vec2db(y1).insert(index - 2, value);
373         }
374         return *this;
375     }
376     // Member function extract a single element from vector
377     // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)378     bool extract(uint32_t index) const {
379         if (index < 2) {
380             return Vec2db(y0).extract(index);
381         }
382         else {
383             return Vec2db(y1).extract(index - 2);
384         }
385     }
386     // Extract a single element. Operator [] can only read an element, not write.
387     bool operator [] (uint32_t index) const {
388         return extract(index);
389     }
390     // Member functions to split into two Vec4fb:
get_low()391     Vec2db get_low() const {
392         return y0;
393     }
get_high()394     Vec2db get_high() const {
395         return y1;
396     }
size()397     static int size () {
398         return 4;
399     }
400 };
401 
402 
403 /*****************************************************************************
404 *
405 *          Operators for Vec4db
406 *
407 *****************************************************************************/
408 
409 // vector operator & : bitwise and
410 static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) {
411     return Vec4db(a.get_low() & b.get_low(), a.get_high() & b.get_high());
412 }
413 static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) {
414     return a & b;
415 }
416 
417 // vector operator &= : bitwise and
418 static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) {
419     a = a & b;
420     return a;
421 }
422 
423 // vector operator | : bitwise or
424 static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) {
425     return Vec4db(a.get_low() | b.get_low(), a.get_high() | b.get_high());
426 }
427 static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) {
428     return a | b;
429 }
430 
431 // vector operator |= : bitwise or
432 static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) {
433     a = a | b;
434     return a;
435 }
436 
437 // vector operator ^ : bitwise xor
438 static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) {
439     return Vec4db(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
440 
441 }
442 
443 // vector operator ^= : bitwise xor
444 static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) {
445     a = a ^ b;
446     return a;
447 }
448 
449 // vector operator ~ : bitwise not
450 static inline Vec4db operator ~ (Vec4db const & a) {
451     return Vec4db(~a.get_low(), ~a.get_high());
452 }
453 
454 // vector operator ! : logical not
455 // (operator ! is less efficient than operator ~. Use only where not
456 // all bits in an element are the same)
457 static inline Vec4db operator ! (Vec4db const & a) {
458     return Vec4db(!a.get_low(), !a.get_high());
459 }
460 
461 // Functions for Vec4db
462 
463 // andnot: a & ~ b
andnot(Vec4db const & a,Vec4db const & b)464 static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) {
465     return Vec4db(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
466 }
467 
468 
469 /*****************************************************************************
470 *
471 *          Horizontal Boolean functions
472 *
473 *****************************************************************************/
474 
475 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec4db const & a)476 static inline bool horizontal_and (Vec4db const & a) {
477     return horizontal_and(a.get_low() & a.get_high());
478 }
479 
480 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec4db const & a)481 static inline bool horizontal_or (Vec4db const & a) {
482     return horizontal_or(a.get_low() | a.get_high());
483 }
484 
485 
486 
487 /*****************************************************************************
488 *
489 *          Vec8f: Vector of 8 single precision floating point values
490 *
491 *****************************************************************************/
492 
493 class Vec8f : public Vec256fe {
494 public:
495     // Default constructor:
Vec8f()496     Vec8f() {
497     }
498     // Constructor to broadcast the same value into all elements:
Vec8f(float f)499     Vec8f(float f) {
500         y1 = y0 = _mm_set1_ps(f);
501     }
502     // Constructor to build from all elements:
Vec8f(float f0,float f1,float f2,float f3,float f4,float f5,float f6,float f7)503     Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
504         y0 = _mm_setr_ps(f0, f1, f2, f3);
505         y1 = _mm_setr_ps(f4, f5, f6, f7);
506     }
507     // Constructor to build from two Vec4f:
Vec8f(Vec4f const & a0,Vec4f const & a1)508     Vec8f(Vec4f const & a0, Vec4f const & a1) {
509         y0 = a0;  y1 = a1;
510     }
511     // Constructor to convert from type Vec256fe
Vec8f(Vec256fe const & x)512     Vec8f(Vec256fe const & x) {
513         y0 = x.get_low();  y1 = x.get_high();
514     }
515     // Assignment operator to convert from type Vec256fe
516     Vec8f & operator = (Vec256fe const & x) {
517         y0 = x.get_low();  y1 = x.get_high();
518         return *this;
519     }
520     // Member function to load from array (unaligned)
load(float const * p)521     Vec8f & load(float const * p) {
522         y0 = _mm_loadu_ps(p);
523         y1 = _mm_loadu_ps(p+4);
524         return *this;
525     }
526     // Member function to load from array, aligned by 32
527     // You may use load_a instead of load if you are certain that p points to an address
528     // divisible by 32.
load_a(float const * p)529     Vec8f & load_a(float const * p) {
530         y0 = _mm_load_ps(p);
531         y1 = _mm_load_ps(p+4);
532         return *this;
533     }
534     // Member function to store into array (unaligned)
store(float * p)535     void store(float * p) const {
536         _mm_storeu_ps(p,   y0);
537         _mm_storeu_ps(p+4, y1);
538     }
539     // Member function to store into array, aligned by 32
540     // You may use store_a instead of store if you are certain that p points to an address
541     // divisible by 32.
store_a(float * p)542     void store_a(float * p) const {
543         _mm_store_ps(p,   y0);
544         _mm_store_ps(p+4, y1);
545     }
546     // Partial load. Load n elements and set the rest to 0
load_partial(int n,float const * p)547     Vec8f & load_partial(int n, float const * p) {
548         if (n > 0 && n <= 4) {
549             *this = Vec8f(Vec4f().load_partial(n, p),_mm_setzero_ps());
550         }
551         else if (n > 4 && n <= 8) {
552             *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
553         }
554         else {
555             y1 = y0 = _mm_setzero_ps();
556         }
557         return *this;
558     }
559     // Partial store. Store n elements
store_partial(int n,float * p)560     void store_partial(int n, float * p) const {
561         if (n <= 4) {
562             get_low().store_partial(n, p);
563         }
564         else if (n <= 8) {
565             get_low().store(p);
566             get_high().store_partial(n - 4, p + 4);
567         }
568     }
569     // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)570     Vec8f & cutoff(int n) {
571         if (uint32_t(n) >= 8) return *this;
572         else if (n >= 4) {
573             y1 = Vec4f(y1).cutoff(n - 4);
574         }
575         else {
576             y0 = Vec4f(y0).cutoff(n);
577             y1 = Vec4f(0.0f);
578         }
579         return *this;
580     }
581     // Member function to change a single element in vector
582     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,float value)583     Vec8f const & insert(uint32_t index, float value) {
584         if (index < 4) {
585             y0 = Vec4f(y0).insert(index, value);
586         }
587         else {
588             y1 = Vec4f(y1).insert(index - 4, value);
589         }
590         return *this;
591     }
592     // Member function extract a single element from vector
593     // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)594     float extract(uint32_t index) const {
595         if (index < 4) {
596             return Vec4f(y0).extract(index);
597         }
598         else {
599             return Vec4f(y1).extract(index - 4);
600         }
601     }
602     // Extract a single element. Use store function if extracting more than one element.
603     // Operator [] can only read an element, not write.
604     float operator [] (uint32_t index) const {
605         return extract(index);
606     }
607     // Member functions to split into two Vec4f:
get_low()608     Vec4f get_low() const {
609         return y0;
610     }
get_high()611     Vec4f get_high() const {
612         return y1;
613     }
size()614     static int size () {
615         return 8;
616     }
617 };
618 
619 
620 /*****************************************************************************
621 *
622 *          Operators for Vec8f
623 *
624 *****************************************************************************/
625 
626 // vector operator + : add element by element
627 static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
628     return Vec8f(a.get_low() + b.get_low(), a.get_high() + b.get_high());
629 }
630 
631 // vector operator + : add vector and scalar
632 static inline Vec8f operator + (Vec8f const & a, float b) {
633     return a + Vec8f(b);
634 }
635 static inline Vec8f operator + (float a, Vec8f const & b) {
636     return Vec8f(a) + b;
637 }
638 
639 // vector operator += : add
640 static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
641     a = a + b;
642     return a;
643 }
644 
645 // postfix operator ++
646 static inline Vec8f operator ++ (Vec8f & a, int) {
647     Vec8f a0 = a;
648     a = a + 1.0f;
649     return a0;
650 }
651 
652 // prefix operator ++
653 static inline Vec8f & operator ++ (Vec8f & a) {
654     a = a + 1.0f;
655     return a;
656 }
657 
658 // vector operator - : subtract element by element
659 static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
660     return Vec8f(a.get_low() - b.get_low(), a.get_high() - b.get_high());
661 }
662 
663 // vector operator - : subtract vector and scalar
664 static inline Vec8f operator - (Vec8f const & a, float b) {
665     return a - Vec8f(b);
666 }
667 static inline Vec8f operator - (float a, Vec8f const & b) {
668     return Vec8f(a) - b;
669 }
670 
671 // vector operator - : unary minus
672 // Change sign bit, even for 0, INF and NAN
673 static inline Vec8f operator - (Vec8f const & a) {
674     return Vec8f(-a.get_low(), -a.get_high());
675 }
676 
677 // vector operator -= : subtract
678 static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
679     a = a - b;
680     return a;
681 }
682 
683 // postfix operator --
684 static inline Vec8f operator -- (Vec8f & a, int) {
685     Vec8f a0 = a;
686     a = a - 1.0f;
687     return a0;
688 }
689 
690 // prefix operator --
691 static inline Vec8f & operator -- (Vec8f & a) {
692     a = a - 1.0f;
693     return a;
694 }
695 
696 // vector operator * : multiply element by element
697 static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
698     return Vec8f(a.get_low() * b.get_low(), a.get_high() * b.get_high());
699 }
700 
701 // vector operator * : multiply vector and scalar
702 static inline Vec8f operator * (Vec8f const & a, float b) {
703     return a * Vec8f(b);
704 }
705 static inline Vec8f operator * (float a, Vec8f const & b) {
706     return Vec8f(a) * b;
707 }
708 
709 // vector operator *= : multiply
710 static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
711     a = a * b;
712     return a;
713 }
714 
715 // vector operator / : divide all elements by same integer
716 static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
717     return Vec8f(a.get_low() / b.get_low(), a.get_high() / b.get_high());
718 }
719 
720 // vector operator / : divide vector and scalar
721 static inline Vec8f operator / (Vec8f const & a, float b) {
722     return a / Vec8f(b);
723 }
724 static inline Vec8f operator / (float a, Vec8f const & b) {
725     return Vec8f(a) / b;
726 }
727 
728 // vector operator /= : divide
729 static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
730     a = a / b;
731     return a;
732 }
733 
734 // vector operator == : returns true for elements for which a == b
735 static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
736     return Vec8fb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
737 }
738 
739 // vector operator != : returns true for elements for which a != b
740 static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
741     return Vec8fb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
742 }
743 
744 // vector operator < : returns true for elements for which a < b
745 static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
746     return Vec8fb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
747 }
748 
749 // vector operator <= : returns true for elements for which a <= b
750 static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
751     return Vec8fb(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
752 }
753 
754 // vector operator > : returns true for elements for which a > b
755 static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
756     return Vec8fb(a.get_low() > b.get_low(), a.get_high() > b.get_high());
757 }
758 
759 // vector operator >= : returns true for elements for which a >= b
760 static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
761     return Vec8fb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
762 }
763 
764 // Bitwise logical operators
765 
766 // vector operator & : bitwise and
767 static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
768     return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
769 }
770 
771 // vector operator &= : bitwise and
772 static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
773     a = a & b;
774     return a;
775 }
776 
777 // vector operator & : bitwise and of Vec8f and Vec8fb
778 static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
779     return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
780 }
781 static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
782     return Vec8f(a.get_low() & b.get_low(), a.get_high() & b.get_high());
783 }
784 
785 // vector operator | : bitwise or
786 static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
787     return Vec8f(a.get_low() | b.get_low(), a.get_high() | b.get_high());
788 }
789 
790 // vector operator |= : bitwise or
791 static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
792     a = a | b;
793     return a;
794 }
795 
796 // vector operator ^ : bitwise xor
797 static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
798     return Vec8f(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
799 }
800 
801 // vector operator ^= : bitwise xor
802 static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
803     a = a ^ b;
804     return a;
805 }
806 
807 // vector operator ! : logical not. Returns Boolean vector
808 static inline Vec8fb operator ! (Vec8f const & a) {
809     return Vec8fb(!a.get_low(), !a.get_high());
810 }
811 
812 
813 /*****************************************************************************
814 *
815 *          Functions for Vec8f
816 *
817 *****************************************************************************/
818 
819 // Select between two operands. Corresponds to this pseudocode:
820 // for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
821 // Each byte in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
select(Vec8fb const & s,Vec8f const & a,Vec8f const & b)822 static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) {
823     return Vec8f(select(s.get_low(),a.get_low(),b.get_low()), select(s.get_high(),a.get_high(),b.get_high()));
824 }
825 
826 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8fb const & f,Vec8f const & a,Vec8f const & b)827 static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
828     return a + (Vec8f(f) & b);
829 }
830 
831 // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
if_mul(Vec8fb const & f,Vec8f const & a,Vec8f const & b)832 static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
833     return a * select(f, b, 1.f);
834 }
835 
836 
837 // General arithmetic functions, etc.
838 
839 // Horizontal add: Calculates the sum of all vector elements.
horizontal_add(Vec8f const & a)840 static inline float horizontal_add (Vec8f const & a) {
841     return horizontal_add(a.get_low() + a.get_high());
842 }
843 
844 // function max: a > b ? a : b
max(Vec8f const & a,Vec8f const & b)845 static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
846     return Vec8f(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
847 }
848 
849 // function min: a < b ? a : b
min(Vec8f const & a,Vec8f const & b)850 static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
851     return Vec8f(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
852 }
853 
854 // function abs: absolute value
855 // Removes sign bit, even for -0.0f, -INF and -NAN
abs(Vec8f const & a)856 static inline Vec8f abs(Vec8f const & a) {
857     return Vec8f(abs(a.get_low()), abs(a.get_high()));
858 }
859 
860 // function sqrt: square root
sqrt(Vec8f const & a)861 static inline Vec8f sqrt(Vec8f const & a) {
862     return Vec8f(sqrt(a.get_low()), sqrt(a.get_high()));
863 }
864 
865 // function square: a * a
square(Vec8f const & a)866 static inline Vec8f square(Vec8f const & a) {
867     return Vec8f(square(a.get_low()), square(a.get_high()));
868 }
869 
870 // pow(Vec8f, int):
871 template <typename TT> static Vec8f pow(Vec8f const & a, TT const & n);
872 
873 // Raise floating point numbers to integer power n
874 template <>
875 inline Vec8f pow<int>(Vec8f const & x0, int const & n) {
876     return pow_template_i<Vec8f>(x0, n);
877 }
878 
879 // allow conversion from unsigned int
880 template <>
881 inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t const & n) {
882     return pow_template_i<Vec8f>(x0, (int)n);
883 }
884 
885 
886 // Raise floating point numbers to integer power n, where n is a compile-time constant
887 template <int n>
pow_n(Vec8f const & a)888 static inline Vec8f pow_n(Vec8f const & a) {
889     return Vec8f(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
890 }
891 
892 template <int n>
pow(Vec8f const & a,Const_int_t<n>)893 static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
894     return pow_n<n>(a);
895 }
896 
897 
898 // function round: round to nearest integer (even). (result as float vector)
round(Vec8f const & a)899 static inline Vec8f round(Vec8f const & a) {
900     return Vec8f(round(a.get_low()), round(a.get_high()));
901 }
902 
903 // function truncate: round towards zero. (result as float vector)
truncate(Vec8f const & a)904 static inline Vec8f truncate(Vec8f const & a) {
905     return Vec8f(truncate(a.get_low()), truncate(a.get_high()));
906 }
907 
908 // function floor: round towards minus infinity. (result as float vector)
floor(Vec8f const & a)909 static inline Vec8f floor(Vec8f const & a) {
910     return Vec8f(floor(a.get_low()), floor(a.get_high()));
911 }
912 
913 // function ceil: round towards plus infinity. (result as float vector)
ceil(Vec8f const & a)914 static inline Vec8f ceil(Vec8f const & a) {
915     return Vec8f(ceil(a.get_low()), ceil(a.get_high()));
916 }
917 
918 #ifdef VECTORI256_H  // 256 bit integer vectors are available
919 // function round_to_int: round to nearest integer (even). (result as integer vector)
round_to_int(Vec8f const & a)920 static inline Vec8i round_to_int(Vec8f const & a) {
921     // Note: assume MXCSR control register is set to rounding
922     return Vec8i(round_to_int(a.get_low()), round_to_int(a.get_high()));
923 }
924 
925 // function truncate_to_int: round towards zero. (result as integer vector)
truncate_to_int(Vec8f const & a)926 static inline Vec8i truncate_to_int(Vec8f const & a) {
927     return Vec8i(truncate_to_int(a.get_low()), truncate_to_int(a.get_high()));
928 }
929 
930 // function to_float: convert integer vector to float vector
to_float(Vec8i const & a)931 static inline Vec8f to_float(Vec8i const & a) {
932     return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
933 }
934 
935 // function to_float: convert unsigned integer vector to float vector
to_float(Vec8ui const & a)936 static inline Vec8f to_float(Vec8ui const & a) {
937     return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
938 }
939 
940 #endif // VECTORI256_H
941 
942 
943 // Approximate math functions
944 
945 // approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
approx_recipr(Vec8f const & a)946 static inline Vec8f approx_recipr(Vec8f const & a) {
947     return Vec8f(approx_recipr(a.get_low()), approx_recipr(a.get_high()));
948 }
949 
950 // approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
approx_rsqrt(Vec8f const & a)951 static inline Vec8f approx_rsqrt(Vec8f const & a) {
952     return Vec8f(approx_rsqrt(a.get_low()), approx_rsqrt(a.get_high()));
953 }
954 
955 // Fused multiply and add functions
956 
957 // Multiply and add
mul_add(Vec8f const & a,Vec8f const & b,Vec8f const & c)958 static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
959     return Vec8f(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high()));
960 }
961 
962 // Multiply and subtract
mul_sub(Vec8f const & a,Vec8f const & b,Vec8f const & c)963 static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
964     return Vec8f(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high()));
965 }
966 
967 // Multiply and inverse subtract
nmul_add(Vec8f const & a,Vec8f const & b,Vec8f const & c)968 static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
969     return Vec8f(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high()));
970 }
971 
972 
973 // Multiply and subtract with extra precision on the intermediate calculations,
974 // even if FMA instructions not supported, using Veltkamp-Dekker split
mul_sub_x(Vec8f const & a,Vec8f const & b,Vec8f const & c)975 static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
976     return Vec8f(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high()));
977 }
978 
979 
980 // Math functions using fast bit manipulation
981 
982 #ifdef VECTORI256_H  // 256 bit integer vectors are available
983 // Extract the exponent as an integer
984 // exponent(a) = floor(log2(abs(a)));
985 // exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
exponent(Vec8f const & a)986 static inline Vec8i exponent(Vec8f const & a) {
987     return Vec8i(exponent(a.get_low()), exponent(a.get_high()));
988 }
989 #endif
990 
991 // Extract the fraction part of a floating point number
992 // a = 2^exponent(a) * fraction(a), except for a = 0
993 // fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
fraction(Vec8f const & a)994 static inline Vec8f fraction(Vec8f const & a) {
995     return Vec8f(fraction(a.get_low()), fraction(a.get_high()));
996 }
997 
998 #ifdef VECTORI256_H  // 256 bit integer vectors are available
999 // Fast calculation of pow(2,n) with n integer
1000 // n  =    0 gives 1.0f
1001 // n >=  128 gives +INF
1002 // n <= -127 gives 0.0f
1003 // This function will never produce denormals, and never raise exceptions
exp2(Vec8i const & a)1004 static inline Vec8f exp2(Vec8i const & a) {
1005     return Vec8f(exp2(a.get_low()), exp2(a.get_high()));
1006 }
1007 //static Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h
1008 #endif // VECTORI256_H
1009 
1010 
1011 
1012 // Categorization functions
1013 
1014 // Function sign_bit: gives true for elements that have the sign bit set
1015 // even for -0.0f, -INF and -NAN
1016 // Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
1017 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
sign_bit(Vec8f const & a)1018 static inline Vec8fb sign_bit(Vec8f const & a) {
1019     return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high()));
1020 }
1021 
1022 // Function sign_combine: changes the sign of a when b has the sign bit set
1023 // same as select(sign_bit(b), -a, a)
sign_combine(Vec8f const & a,Vec8f const & b)1024 static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) {
1025     return Vec8f(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
1026 }
1027 
1028 // Function is_finite: gives true for elements that are normal, denormal or zero,
1029 // false for INF and NAN
1030 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_finite(Vec8f const & a)1031 static inline Vec8fb is_finite(Vec8f const & a) {
1032     return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high()));
1033 }
1034 
1035 // Function is_inf: gives true for elements that are +INF or -INF
1036 // false for finite numbers and NAN
1037 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_inf(Vec8f const & a)1038 static inline Vec8fb is_inf(Vec8f const & a) {
1039     return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high()));
1040 }
1041 
1042 // Function is_nan: gives true for elements that are +NAN or -NAN
1043 // false for finite numbers and +/-INF
1044 // (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
is_nan(Vec8f const & a)1045 static inline Vec8fb is_nan(Vec8f const & a) {
1046     return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high()));
1047 }
1048 
1049 // Function is_subnormal: gives true for elements that are denormal (subnormal)
1050 // false for finite numbers, zero, NAN and INF
is_subnormal(Vec8f const & a)1051 static inline Vec8fb is_subnormal(Vec8f const & a) {
1052     return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
1053 }
1054 
1055 // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
1056 // false for finite numbers, NAN and INF
is_zero_or_subnormal(Vec8f const & a)1057 static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) {
1058     return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
1059 }
1060 
1061 // Function infinite4f: returns a vector where all elements are +INF
infinite8f()1062 static inline Vec8f infinite8f() {
1063     return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>();
1064 }
1065 
1066 // Function nan4f: returns a vector where all elements are +NAN (quiet)
1067 static inline Vec8f nan8f(int n = 0x10) {
1068     return Vec8f(nan4f(n), nan4f(n));
1069 }
1070 
1071 // change signs on vectors Vec8f
1072 // Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
1073 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
change_sign(Vec8f const & a)1074 static inline Vec8f change_sign(Vec8f const & a) {
1075     if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;
1076     Vec4f lo = change_sign<i0,i1,i2,i3>(a.get_low());
1077     Vec4f hi = change_sign<i4,i5,i6,i7>(a.get_high());
1078     return Vec8f(lo, hi);
1079 }
1080 
1081 
1082 /*****************************************************************************
1083 *
1084 *          Vec2d: Vector of 2 double precision floating point values
1085 *
1086 *****************************************************************************/
1087 
1088 class Vec4d : public Vec256de {
1089 public:
1090     // Default constructor:
Vec4d()1091     Vec4d() {
1092     }
1093     // Constructor to broadcast the same value into all elements:
Vec4d(double d)1094     Vec4d(double d) {
1095         y1 = y0 = _mm_set1_pd(d);
1096     }
1097     // Constructor to build from all elements:
Vec4d(double d0,double d1,double d2,double d3)1098     Vec4d(double d0, double d1, double d2, double d3) {
1099         y0 = _mm_setr_pd(d0, d1);
1100         y1 = _mm_setr_pd(d2, d3);
1101     }
1102     // Constructor to build from two Vec4f:
Vec4d(Vec2d const & a0,Vec2d const & a1)1103     Vec4d(Vec2d const & a0, Vec2d const & a1) {
1104         y0 = a0;  y1 = a1;
1105     }
1106     // Constructor to convert from type Vec256de
Vec4d(Vec256de const & x)1107     Vec4d(Vec256de const & x) {
1108         y0 = x.get_low();
1109         y1 = x.get_high();
1110     }
1111     // Assignment operator to convert from type Vec256de
1112     Vec4d & operator = (Vec256de const & x) {
1113         y0 = x.get_low();
1114         y1 = x.get_high();
1115         return *this;
1116     }
1117     // Member function to load from array (unaligned)
load(double const * p)1118     Vec4d & load(double const * p) {
1119         y0 = _mm_loadu_pd(p);
1120         y1 = _mm_loadu_pd(p+2);
1121         return *this;
1122     }
1123     // Member function to load from array, aligned by 32
1124     // You may use load_a instead of load if you are certain that p points to an address
1125     // divisible by 32
load_a(double const * p)1126     Vec4d & load_a(double const * p) {
1127         y0 = _mm_load_pd(p);
1128         y1 = _mm_load_pd(p+2);
1129         return *this;
1130     }
1131     // Member function to store into array (unaligned)
store(double * p)1132     void store(double * p) const {
1133         _mm_storeu_pd(p,   y0);
1134         _mm_storeu_pd(p+2, y1);
1135     }
1136     // Member function to store into array, aligned by 32
1137     // You may use store_a instead of store if you are certain that p points to an address
1138     // divisible by 32
store_a(double * p)1139     void store_a(double * p) const {
1140         _mm_store_pd(p,   y0);
1141         _mm_store_pd(p+2, y1);
1142     }
1143     // Partial load. Load n elements and set the rest to 0
load_partial(int n,double const * p)1144     Vec4d & load_partial(int n, double const * p) {
1145         if (n > 0 && n <= 2) {
1146             *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
1147         }
1148         else if (n > 2 && n <= 4) {
1149             *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
1150         }
1151         else {
1152             y1 = y0 = _mm_setzero_pd();
1153         }
1154         return *this;
1155     }
1156     // Partial store. Store n elements
store_partial(int n,double * p)1157     void store_partial(int n, double * p) const {
1158         if (n <= 2) {
1159             get_low().store_partial(n, p);
1160         }
1161         else if (n <= 4) {
1162             get_low().store(p);
1163             get_high().store_partial(n - 2, p + 2);
1164         }
1165     }
cutoff(int n)1166     Vec4d & cutoff(int n) {
1167         if (uint32_t(n) >= 4) return *this;
1168         else if (n >= 2) {
1169             y1 = Vec2d(y1).cutoff(n - 2);
1170         }
1171         else {
1172             y0 = Vec2d(y0).cutoff(n);
1173             y1 = Vec2d(0.0);
1174         }
1175         return *this;
1176     }
1177     // Member function to change a single element in vector
1178     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,double value)1179     Vec4d const & insert(uint32_t index, double value) {
1180         if (index < 2) {
1181             y0 = Vec2d(y0).insert(index, value);
1182         }
1183         else {
1184             y1 = Vec2d(y1).insert(index-2, value);
1185         }
1186         return *this;
1187     }
1188     // Member function extract a single element from vector
1189     // Note: This function is inefficient. Use store function if extracting more than one element
extract(uint32_t index)1190     double extract(uint32_t index) const {
1191         if (index < 2) {
1192             return Vec2d(y0).extract(index);
1193         }
1194         else {
1195             return Vec2d(y1).extract(index-2);
1196         }
1197     }
1198     // Extract a single element. Use store function if extracting more than one element.
1199     // Operator [] can only read an element, not write.
1200     double operator [] (uint32_t index) const {
1201         return extract(index);
1202     }
1203     // Member functions to split into two Vec2d:
get_low()1204     Vec2d get_low() const {
1205         return y0;
1206     }
get_high()1207     Vec2d get_high() const {
1208         return y1;
1209     }
size()1210     static int size () {
1211         return 4;
1212     }
1213 };
1214 
1215 
1216 
1217 /*****************************************************************************
1218 *
1219 *          Operators for Vec4d
1220 *
1221 *****************************************************************************/
1222 
1223 // vector operator + : add element by element
1224 static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
1225     return Vec4d(a.get_low() + b.get_low(), a.get_high() + b.get_high());
1226 }
1227 
1228 // vector operator + : add vector and scalar
1229 static inline Vec4d operator + (Vec4d const & a, double b) {
1230     return a + Vec4d(b);
1231 }
1232 static inline Vec4d operator + (double a, Vec4d const & b) {
1233     return Vec4d(a) + b;
1234 }
1235 
1236 // vector operator += : add
1237 static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
1238     a = a + b;
1239     return a;
1240 }
1241 
1242 // postfix operator ++
1243 static inline Vec4d operator ++ (Vec4d & a, int) {
1244     Vec4d a0 = a;
1245     a = a + 1.0;
1246     return a0;
1247 }
1248 
1249 // prefix operator ++
1250 static inline Vec4d & operator ++ (Vec4d & a) {
1251     a = a + 1.0;
1252     return a;
1253 }
1254 
1255 // vector operator - : subtract element by element
1256 static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
1257     return Vec4d(a.get_low() - b.get_low(), a.get_high() - b.get_high());
1258 }
1259 
1260 // vector operator - : subtract vector and scalar
1261 static inline Vec4d operator - (Vec4d const & a, double b) {
1262     return a - Vec4d(b);
1263 }
1264 static inline Vec4d operator - (double a, Vec4d const & b) {
1265     return Vec4d(a) - b;
1266 }
1267 
1268 // vector operator - : unary minus
1269 // Change sign bit, even for 0, INF and NAN
1270 static inline Vec4d operator - (Vec4d const & a) {
1271     return Vec4d(-a.get_low(), -a.get_high());
1272 }
1273 
1274 // vector operator -= : subtract
1275 static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
1276     a = a - b;
1277     return a;
1278 }
1279 
1280 // postfix operator --
1281 static inline Vec4d operator -- (Vec4d & a, int) {
1282     Vec4d a0 = a;
1283     a = a - 1.0;
1284     return a0;
1285 }
1286 
1287 // prefix operator --
1288 static inline Vec4d & operator -- (Vec4d & a) {
1289     a = a - 1.0;
1290     return a;
1291 }
1292 
1293 // vector operator * : multiply element by element
1294 static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
1295     return Vec4d(a.get_low() * b.get_low(), a.get_high() * b.get_high());
1296 }
1297 
1298 // vector operator * : multiply vector and scalar
1299 static inline Vec4d operator * (Vec4d const & a, double b) {
1300     return a * Vec4d(b);
1301 }
1302 static inline Vec4d operator * (double a, Vec4d const & b) {
1303     return Vec4d(a) * b;
1304 }
1305 
1306 // vector operator *= : multiply
1307 static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
1308     a = a * b;
1309     return a;
1310 }
1311 
1312 // vector operator / : divide all elements by same integer
1313 static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
1314     return Vec4d(a.get_low() / b.get_low(), a.get_high() / b.get_high());
1315 }
1316 
1317 // vector operator / : divide vector and scalar
1318 static inline Vec4d operator / (Vec4d const & a, double b) {
1319     return a / Vec4d(b);
1320 }
1321 static inline Vec4d operator / (double a, Vec4d const & b) {
1322     return Vec4d(a) / b;
1323 }
1324 
1325 // vector operator /= : divide
1326 static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
1327     a = a / b;
1328     return a;
1329 }
1330 
1331 // vector operator == : returns true for elements for which a == b
1332 static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) {
1333     return Vec4db(a.get_low() == b.get_low(), a.get_high() == b.get_high());
1334 }
1335 
1336 // vector operator != : returns true for elements for which a != b
1337 static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) {
1338     return Vec4db(a.get_low() != b.get_low(), a.get_high() != b.get_high());
1339 }
1340 
1341 // vector operator < : returns true for elements for which a < b
1342 static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) {
1343     return Vec4db(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1344 }
1345 
1346 // vector operator <= : returns true for elements for which a <= b
1347 static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
1348     return Vec4db(a.get_low() <= b.get_low(), a.get_high() <= b.get_high());
1349 }
1350 
1351 // vector operator > : returns true for elements for which a > b
1352 static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) {
1353     return Vec4db(a.get_low() > b.get_low(), a.get_high() > b.get_high());
1354 }
1355 
1356 // vector operator >= : returns true for elements for which a >= b
1357 static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
1358     return Vec4db(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1359 }
1360 
1361 // Bitwise logical operators
1362 
1363 // vector operator & : bitwise and
1364 static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
1365     return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1366 }
1367 
1368 // vector operator &= : bitwise and
1369 static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
1370     a = a & b;
1371     return a;
1372 }
1373 
1374 // vector operator & : bitwise and of Vec4d and Vec4db
1375 static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
1376     return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1377 }
1378 static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
1379     return Vec4d(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1380 }
1381 
1382 // vector operator | : bitwise or
1383 static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
1384     return Vec4d(a.get_low() | b.get_low(), a.get_high() | b.get_high());
1385 }
1386 
1387 // vector operator |= : bitwise or
1388 static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
1389     a = a | b;
1390     return a;
1391 }
1392 
1393 // vector operator ^ : bitwise xor
1394 static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
1395     return Vec4d(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
1396 }
1397 
1398 // vector operator ^= : bitwise xor
1399 static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
1400     a = a ^ b;
1401     return a;
1402 }
1403 
1404 // vector operator ! : logical not. Returns Boolean vector
1405 static inline Vec4db operator ! (Vec4d const & a) {
1406     return Vec4db(!a.get_low(), !a.get_high());
1407 }
1408 
1409 
1410 /*****************************************************************************
1411 *
1412 *          Functions for Vec4d
1413 *
1414 *****************************************************************************/
1415 
1416 // Select between two operands. Corresponds to this pseudocode:
1417 // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
1418 // Each byte in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
1419 // No other values are allowed.
select(Vec4db const & s,Vec4d const & a,Vec4d const & b)1420 static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) {
1421     return Vec4d(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
1422 }
1423 
1424 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec4db const & f,Vec4d const & a,Vec4d const & b)1425 static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
1426     return a + (Vec4d(f) & b);
1427 }
1428 
1429 // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
if_mul(Vec4db const & f,Vec4d const & a,Vec4d const & b)1430 static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
1431     return a * select(f, b, 1.f);
1432 }
1433 
1434 // General arithmetic functions, etc.
1435 
1436 // Horizontal add: Calculates the sum of all vector elements.
horizontal_add(Vec4d const & a)1437 static inline double horizontal_add (Vec4d const & a) {
1438     return horizontal_add(a.get_low() + a.get_high());
1439 }
1440 
1441 // function max: a > b ? a : b
max(Vec4d const & a,Vec4d const & b)1442 static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
1443     return Vec4d(max(a.get_low(),b.get_low()), max(a.get_high(),b.get_high()));
1444 }
1445 
1446 // function min: a < b ? a : b
min(Vec4d const & a,Vec4d const & b)1447 static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
1448     return Vec4d(min(a.get_low(),b.get_low()), min(a.get_high(),b.get_high()));
1449 }
1450 
1451 // function abs: absolute value
1452 // Removes sign bit, even for -0.0f, -INF and -NAN
abs(Vec4d const & a)1453 static inline Vec4d abs(Vec4d const & a) {
1454     return Vec4d(abs(a.get_low()), abs(a.get_high()));
1455 }
1456 
1457 // function sqrt: square root
sqrt(Vec4d const & a)1458 static inline Vec4d sqrt(Vec4d const & a) {
1459     return Vec4d(sqrt(a.get_low()), sqrt(a.get_high()));
1460 }
1461 
1462 // function square: a * a
square(Vec4d const & a)1463 static inline Vec4d square(Vec4d const & a) {
1464     return Vec4d(square(a.get_low()), square(a.get_high()));
1465 }
1466 
1467 // pow(Vec4d, int):
1468 // Raise floating point numbers to integer power n
1469 template <typename TT> static Vec4d pow(Vec4d const & a, TT const & n);
1470 
1471 // Raise floating point numbers to integer power n
1472 template <>
1473 inline Vec4d pow<int>(Vec4d const & x0, int const & n) {
1474     return pow_template_i<Vec4d>(x0, n);
1475 }
1476 
1477 // allow conversion from unsigned int
1478 template <>
1479 inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t const & n) {
1480     return pow_template_i<Vec4d>(x0, (int)n);
1481 }
1482 
1483 
1484 // Raise floating point numbers to integer power n, where n is a compile-time constant
1485 template <int n>
pow_n(Vec4d const & a)1486 static inline Vec4d pow_n(Vec4d const & a) {
1487     return Vec4d(pow_n<n>(a.get_low()), pow_n<n>(a.get_high()));
1488 }
1489 
1490 template <int n>
pow(Vec4d const & a,Const_int_t<n>)1491 static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
1492     return pow_n<n>(a);
1493 }
1494 
1495 
1496 // function round: round to nearest integer (even). (result as double vector)
round(Vec4d const & a)1497 static inline Vec4d round(Vec4d const & a) {
1498     return Vec4d(round(a.get_low()), round(a.get_high()));
1499 }
1500 
1501 // function truncate: round towards zero. (result as double vector)
truncate(Vec4d const & a)1502 static inline Vec4d truncate(Vec4d const & a) {
1503     return Vec4d(truncate(a.get_low()), truncate(a.get_high()));
1504 }
1505 
1506 // function floor: round towards minus infinity. (result as double vector)
floor(Vec4d const & a)1507 static inline Vec4d floor(Vec4d const & a) {
1508     return Vec4d(floor(a.get_low()), floor(a.get_high()));
1509 }
1510 
1511 // function ceil: round towards plus infinity. (result as double vector)
ceil(Vec4d const & a)1512 static inline Vec4d ceil(Vec4d const & a) {
1513     return Vec4d(ceil(a.get_low()), ceil(a.get_high()));
1514 }
1515 
1516 // function round_to_int: round to nearest integer (even). (result as integer vector)
round_to_int(Vec4d const & a)1517 static inline Vec4i round_to_int(Vec4d const & a) {
1518     // Note: assume MXCSR control register is set to rounding
1519     return round_to_int(a.get_low(), a.get_high());
1520 }
1521 
1522 // function truncate_to_int: round towards zero. (result as integer vector)
truncate_to_int(Vec4d const & a)1523 static inline Vec4i truncate_to_int(Vec4d const & a) {
1524     return truncate_to_int(a.get_low(), a.get_high());
1525 }
1526 
1527 #ifdef VECTORI256_H  // 256 bit integer vectors are available
1528 
1529 // function truncate_to_int64: round towards zero. (inefficient)
truncate_to_int64(Vec4d const & a)1530 static inline Vec4q truncate_to_int64(Vec4d const & a) {
1531     double aa[4];
1532     a.store(aa);
1533     return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
1534 }
1535 
1536 // function truncate_to_int64_limited: round towards zero.
1537 // result as 64-bit integer vector, but with limited range
truncate_to_int64_limited(Vec4d const & a)1538 static inline Vec4q truncate_to_int64_limited(Vec4d const & a) {
1539     return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
1540 }
1541 
1542 // function round_to_int64: round to nearest or even. (inefficient)
round_to_int64(Vec4d const & a)1543 static inline Vec4q round_to_int64(Vec4d const & a) {
1544     return truncate_to_int64(round(a));
1545 }
1546 
1547 // function round_to_int64_limited: round to nearest integer
1548 // result as 64-bit integer vector, but with limited range
round_to_int64_limited(Vec4d const & a)1549 static inline Vec4q round_to_int64_limited(Vec4d const & a) {
1550     return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
1551 }
1552 
1553 // function to_double: convert integer vector elements to double vector (inefficient)
to_double(Vec4q const & a)1554 static inline Vec4d to_double(Vec4q const & a) {
1555     int64_t aa[4];
1556     a.store(aa);
1557     return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
1558 }
1559 
1560 // function to_double_limited: convert integer vector elements to double vector
1561 // limited to abs(x) < 2^31
to_double_limited(Vec4q const & x)1562 static inline Vec4d to_double_limited(Vec4q const & x) {
1563     return Vec4d(to_double_limited(x.get_low()),to_double_limited(x.get_high()));
1564 }
1565 
1566 #endif  // VECTORI256_H
1567 
1568 // function to_double: convert integer vector to double vector
to_double(Vec4i const & a)1569 static inline Vec4d to_double(Vec4i const & a) {
1570     return Vec4d(to_double_low(a), to_double_high(a));
1571 }
1572 
1573 // function compress: convert two Vec4d to one Vec8f
compress(Vec4d const & low,Vec4d const & high)1574 static inline Vec8f compress (Vec4d const & low, Vec4d const & high) {
1575     return Vec8f(compress(low.get_low(), low.get_high()), compress(high.get_low(), high.get_high()));
1576 }
1577 
1578 // Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
extend_low(Vec8f const & a)1579 static inline Vec4d extend_low (Vec8f const & a) {
1580     return Vec4d(extend_low(a.get_low()), extend_high(a.get_low()));
1581 }
1582 
1583 // Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
extend_high(Vec8f const & a)1584 static inline Vec4d extend_high (Vec8f const & a) {
1585     return Vec4d(extend_low(a.get_high()), extend_high(a.get_high()));
1586 }
1587 
1588 
1589 // Fused multiply and add functions
1590 
1591 // Multiply and add
mul_add(Vec4d const & a,Vec4d const & b,Vec4d const & c)1592 static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1593     return Vec4d(mul_add(a.get_low(),b.get_low(),c.get_low()), mul_add(a.get_high(),b.get_high(),c.get_high()));
1594 }
1595 
1596 // Multiply and subtract
mul_sub(Vec4d const & a,Vec4d const & b,Vec4d const & c)1597 static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1598     return Vec4d(mul_sub(a.get_low(),b.get_low(),c.get_low()), mul_sub(a.get_high(),b.get_high(),c.get_high()));
1599 }
1600 
1601 // Multiply and inverse subtract
nmul_add(Vec4d const & a,Vec4d const & b,Vec4d const & c)1602 static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1603     return Vec4d(nmul_add(a.get_low(),b.get_low(),c.get_low()), nmul_add(a.get_high(),b.get_high(),c.get_high()));
1604 }
1605 
1606 // Multiply and subtract with extra precision on the intermediate calculations,
1607 // even if FMA instructions not supported, using Veltkamp-Dekker split
mul_sub_x(Vec4d const & a,Vec4d const & b,Vec4d const & c)1608 static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
1609     return Vec4d(mul_sub_x(a.get_low(),b.get_low(),c.get_low()), mul_sub_x(a.get_high(),b.get_high(),c.get_high()));
1610 }
1611 
1612 
1613 // Math functions using fast bit manipulation
1614 
1615 #ifdef VECTORI256_H  // 256 bit integer vectors are available, AVX2
1616 // Extract the exponent as an integer
1617 // exponent(a) = floor(log2(abs(a)));
1618 // exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
exponent(Vec4d const & a)1619 static inline Vec4q exponent(Vec4d const & a) {
1620     return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
1621 }
1622 
1623 // Extract the fraction part of a floating point number
1624 // a = 2^exponent(a) * fraction(a), except for a = 0
1625 // fraction(1.0) = 1.0, fraction(5.0) = 1.25
fraction(Vec4d const & a)1626 static inline Vec4d fraction(Vec4d const & a) {
1627     return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
1628 }
1629 
1630 // Fast calculation of pow(2,n) with n integer
1631 // n  =     0 gives 1.0
1632 // n >=  1024 gives +INF
1633 // n <= -1023 gives 0.0
1634 // This function will never produce denormals, and never raise exceptions
exp2(Vec4q const & a)1635 static inline Vec4d exp2(Vec4q const & a) {
1636     return Vec4d(exp2(a.get_low()), exp2(a.get_high()));
1637 }
1638 //static Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
1639 #endif
1640 
1641 
1642 // Categorization functions
1643 
1644 // Function sign_bit: gives true for elements that have the sign bit set
1645 // even for -0.0, -INF and -NAN
1646 // Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
sign_bit(Vec4d const & a)1647 static inline Vec4db sign_bit(Vec4d const & a) {
1648     return Vec4db(sign_bit(a.get_low()), sign_bit(a.get_high()));
1649 }
1650 
1651 // Function sign_combine: changes the sign of a when b has the sign bit set
1652 // same as select(sign_bit(b), -a, a)
sign_combine(Vec4d const & a,Vec4d const & b)1653 static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) {
1654     return Vec4d(sign_combine(a.get_low(), b.get_low()), sign_combine(a.get_high(), b.get_high()));
1655 }
1656 
1657 // Function is_finite: gives true for elements that are normal, denormal or zero,
1658 // false for INF and NAN
is_finite(Vec4d const & a)1659 static inline Vec4db is_finite(Vec4d const & a) {
1660     return Vec4db(is_finite(a.get_low()), is_finite(a.get_high()));
1661 }
1662 
1663 // Function is_inf: gives true for elements that are +INF or -INF
1664 // false for finite numbers and NAN
is_inf(Vec4d const & a)1665 static inline Vec4db is_inf(Vec4d const & a) {
1666     return Vec4db(is_inf(a.get_low()), is_inf(a.get_high()));
1667 }
1668 
1669 // Function is_nan: gives true for elements that are +NAN or -NAN
1670 // false for finite numbers and +/-INF
is_nan(Vec4d const & a)1671 static inline Vec4db is_nan(Vec4d const & a) {
1672     return Vec4db(is_nan(a.get_low()), is_nan(a.get_high()));
1673 }
1674 
1675 // Function is_subnormal: gives true for elements that are denormal (subnormal)
1676 // false for finite numbers, zero, NAN and INF
is_subnormal(Vec4d const & a)1677 static inline Vec4db is_subnormal(Vec4d const & a) {
1678     return Vec4db(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
1679 }
1680 
1681 // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
1682 // false for finite numbers, NAN and INF
is_zero_or_subnormal(Vec4d const & a)1683 static inline Vec4db is_zero_or_subnormal(Vec4d const & a) {
1684     return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
1685 }
1686 
1687 // Function infinite2d: returns a vector where all elements are +INF
infinite4d()1688 static inline Vec4d infinite4d() {
1689     return Vec4d(infinite2d(), infinite2d());
1690 }
1691 
1692 // Function nan2d: returns a vector where all elements are +NAN (quiet)
1693 static inline Vec4d nan4d(int n = 0x10) {
1694     return Vec4d(nan2d(n), nan2d(n));
1695 }
1696 
1697 // change signs on vectors Vec4d
1698 // Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
1699 template <int i0, int i1, int i2, int i3>
change_sign(Vec4d const & a)1700 static inline Vec4d change_sign(Vec4d const & a) {
1701     if ((i0 | i1 | i2 | i3) == 0) return a;
1702     Vec2d lo = change_sign<i0,i1>(a.get_low());
1703     Vec2d hi = change_sign<i2,i3>(a.get_high());
1704     return Vec4d(lo, hi);
1705 }
1706 
1707 
1708 /*****************************************************************************
1709 *
1710 *          Functions for reinterpretation between vector types
1711 *
1712 *****************************************************************************/
1713 
reinterpret_i(Vec256ie const & x)1714 static inline Vec256ie reinterpret_i (Vec256ie const & x) {
1715     return x;
1716 }
1717 
reinterpret_i(Vec256fe const & x)1718 static inline Vec256ie reinterpret_i (Vec256fe  const & x) {
1719     return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
1720 }
1721 
reinterpret_i(Vec256de const & x)1722 static inline Vec256ie reinterpret_i (Vec256de const & x) {
1723     return Vec256ie(reinterpret_i(x.get_low()), reinterpret_i(x.get_high()));
1724 }
1725 
reinterpret_f(Vec256ie const & x)1726 static inline Vec256fe  reinterpret_f (Vec256ie const & x) {
1727     return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
1728 }
1729 
reinterpret_f(Vec256fe const & x)1730 static inline Vec256fe  reinterpret_f (Vec256fe  const & x) {
1731     return x;
1732 }
1733 
reinterpret_f(Vec256de const & x)1734 static inline Vec256fe  reinterpret_f (Vec256de const & x) {
1735     return Vec256fe(reinterpret_f(x.get_low()), reinterpret_f(x.get_high()));
1736 }
1737 
reinterpret_d(Vec256ie const & x)1738 static inline Vec256de reinterpret_d (Vec256ie const & x) {
1739     return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
1740 }
1741 
reinterpret_d(Vec256fe const & x)1742 static inline Vec256de reinterpret_d (Vec256fe  const & x) {
1743     return Vec256de(reinterpret_d(x.get_low()), reinterpret_d(x.get_high()));
1744 }
1745 
reinterpret_d(Vec256de const & x)1746 static inline Vec256de reinterpret_d (Vec256de const & x) {
1747     return x;
1748 }
1749 
1750 
1751 /*****************************************************************************
1752 *
1753 *          Vector permute and blend functions
1754 *
1755 ******************************************************************************
1756 *
1757 * The permute function can reorder the elements of a vector and optionally
1758 * set some elements to zero.
1759 *
1760 * The indexes are inserted as template parameters in <>. These indexes must be
1761 * constants. Each template parameter is an index to the element you want to
1762 * select. An index of -1 will generate zero. An index of -256 means don't care.
1763 *
1764 * Example:
1765 * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
1766 * Vec4d b;
1767 * b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
1768 *
1769 *
1770 * The blend function can mix elements from two different vectors and
1771 * optionally set some elements to zero.
1772 *
1773 * The indexes are inserted as template parameters in <>. These indexes must be
1774 * constants. Each template parameter is an index to the element you want to
1775 * select, where indexes 0 - 3 indicate an element from the first source
1776 * vector and indexes 4 - 7 indicate an element from the second source vector.
1777 * A negative index will generate zero.
1778 *
1779 *
1780 * Example:
1781 * Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
1782 * Vec4d b(20., 21., 22., 23.);    // a is (20, 21, 22, 23)
1783 * Vec4d c;
1784 * c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
1785 *****************************************************************************/
1786 
1787 // permute vector Vec4d
1788 template <int i0, int i1, int i2, int i3>
permute4d(Vec4d const & a)1789 static inline Vec4d permute4d(Vec4d const & a) {
1790     return Vec4d(blend2d<i0,i1> (a.get_low(), a.get_high()),
1791            blend2d<i2,i3> (a.get_low(), a.get_high()));
1792 }
1793 
1794 // helper function used below
1795 template <int n>
select4(Vec4d const & a,Vec4d const & b)1796 static inline Vec2d select4(Vec4d const & a, Vec4d const & b) {
1797     switch (n) {
1798     case 0:
1799         return a.get_low();
1800     case 1:
1801         return a.get_high();
1802     case 2:
1803         return b.get_low();
1804     case 3:
1805         return b.get_high();
1806     }
1807     return _mm_setzero_pd();
1808 }
1809 
1810 // blend vectors Vec4d
1811 template <int i0, int i1, int i2, int i3>
blend4d(Vec4d const & a,Vec4d const & b)1812 static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {
1813     const int j0 = i0 >= 0 ? i0/2 : i0;
1814     const int j1 = i1 >= 0 ? i1/2 : i1;
1815     const int j2 = i2 >= 0 ? i2/2 : i2;
1816     const int j3 = i3 >= 0 ? i3/2 : i3;
1817     Vec2d x0, x1;
1818 
1819     if (j0 == j1 || i0 < 0 || i1 < 0) {  // both from same
1820         const int k0 = j0 >= 0 ? j0 : j1;
1821         x0 = permute2d<i0 & -7, i1 & -7> (select4<k0> (a,b));
1822     }
1823     else {
1824         x0 = blend2d<i0 & -7, (i1 & -7) | 2> (select4<j0>(a,b), select4<j1>(a,b));
1825     }
1826     if (j2 == j3 || i2 < 0 || i3 < 0) {  // both from same
1827         const int k1 = j2 >= 0 ? j2 : j3;
1828         x1 = permute2d<i2 & -7, i3 & -7> (select4<k1> (a,b));
1829     }
1830     else {
1831         x1 = blend2d<i2 & -7, (i3 & -7) | 2> (select4<j2>(a,b), select4<j3>(a,b));
1832     }
1833     return Vec4d(x0,x1);
1834 }
1835 
1836 /*****************************************************************************
1837 *
1838 *          Vector Vec8f permute and blend functions
1839 *
1840 *****************************************************************************/
1841 
1842 // permute vector Vec8f
1843 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8f(Vec8f const & a)1844 static inline Vec8f permute8f(Vec8f const & a) {
1845     return Vec8f(blend4f<i0,i1,i2,i3> (a.get_low(), a.get_high()),
1846                  blend4f<i4,i5,i6,i7> (a.get_low(), a.get_high()));
1847 }
1848 
1849 // helper function used below
1850 template <int n>
select4(Vec8f const & a,Vec8f const & b)1851 static inline Vec4f select4(Vec8f const & a, Vec8f const & b) {
1852     switch (n) {
1853     case 0:
1854         return a.get_low();
1855     case 1:
1856         return a.get_high();
1857     case 2:
1858         return b.get_low();
1859     case 3:
1860         return b.get_high();
1861     }
1862     return _mm_setzero_ps();
1863 }
1864 
1865 // blend vectors Vec8f
1866 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8f(Vec8f const & a,Vec8f const & b)1867 static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {
1868     const int j0 = i0 >= 0 ? i0/4 : i0;
1869     const int j1 = i1 >= 0 ? i1/4 : i1;
1870     const int j2 = i2 >= 0 ? i2/4 : i2;
1871     const int j3 = i3 >= 0 ? i3/4 : i3;
1872     const int j4 = i4 >= 0 ? i4/4 : i4;
1873     const int j5 = i5 >= 0 ? i5/4 : i5;
1874     const int j6 = i6 >= 0 ? i6/4 : i6;
1875     const int j7 = i7 >= 0 ? i7/4 : i7;
1876     Vec4f x0, x1;
1877 
1878     const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
1879     const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
1880     const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
1881     const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
1882 
1883     // Combine all the indexes into a single bitfield, with 4 bits for each
1884     const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
1885 
1886     // Mask to zero out negative indexes
1887     const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
1888 
1889     if (r0 < 0) {
1890         x0 = _mm_setzero_ps();
1891     }
1892     else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) {
1893         // i0 - i3 all from same source
1894         x0 = permute4f<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
1895     }
1896     else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) {
1897         // i0 - i3 all from two sources
1898         const int k0 =  i0 >= 0 ? i0 & 3 : i0;
1899         const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
1900         const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
1901         const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
1902         x0 = blend4f<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
1903     }
1904     else {
1905         // i0 - i3 from three or four different sources
1906         x0 = blend4f<0,1,6,7> (
1907              blend4f<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
1908              blend4f<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
1909     }
1910 
1911     if (r1 < 0) {
1912         x1 = _mm_setzero_ps();
1913     }
1914     else if (((m1 ^ r1*0x44440000u) & 0xCCCC0000 & mz) == 0) {
1915         // i4 - i7 all from same source
1916         x1 = permute4f<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
1917     }
1918     else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) {
1919         // i4 - i7 all from two sources
1920         const int k4 =  i4 >= 0 ? i4 & 3 : i4;
1921         const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
1922         const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
1923         const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
1924         x1 = blend4f<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
1925     }
1926     else {
1927         // i4 - i7 from three or four different sources
1928         x1 = blend4f<0,1,6,7> (
1929              blend4f<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
1930              blend4f<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
1931     }
1932 
1933     return Vec8f(x0,x1);
1934 }
1935 
1936 /*****************************************************************************
1937 *
1938 *          Vector lookup functions
1939 *
1940 ******************************************************************************
1941 *
1942 * These functions use vector elements as indexes into a table.
1943 * The table is given as one or more vectors or as an array.
1944 *
1945 * This can be used for several purposes:
1946 *  - table lookup
1947 *  - permute or blend with variable indexes
1948 *  - blend from more than two sources
1949 *  - gather non-contiguous data
1950 *
1951 * An index out of range may produce any value - the actual value produced is
1952 * implementation dependent and may be different for different instruction
1953 * sets. An index out of range does not produce an error message or exception.
1954 *
1955 * Example:
1956 * Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
1957 * Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
1958 * Vec4f c;
1959 * c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
1960 *
1961 *****************************************************************************/
1962 
1963 #ifdef VECTORI256_H  // Vec8i and Vec4q must be defined
1964 
lookup8(Vec8i const & index,Vec8f const & table)1965 static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) {
1966     Vec4f  r0 = lookup8(index.get_low() , table.get_low(), table.get_high());
1967     Vec4f  r1 = lookup8(index.get_high(), table.get_low(), table.get_high());
1968     return Vec8f(r0, r1);
1969 }
1970 
1971 template <int n>
lookup(Vec8i const & index,float const * table)1972 static inline Vec8f lookup(Vec8i const & index, float const * table) {
1973     if (n <= 0) return 0;
1974     if (n <= 4) {
1975         Vec4f table1 = Vec4f().load(table);
1976         return Vec8f(
1977             lookup4 (index.get_low(),  table1),
1978             lookup4 (index.get_high(), table1));
1979     }
1980     if (n <= 8) {
1981         return lookup8(index, Vec8f().load(table));
1982     }
1983     // Limit index
1984     Vec8ui index1;
1985     if ((n & (n-1)) == 0) {
1986         // n is a power of 2, make index modulo n
1987         index1 = Vec8ui(index) & (n-1);
1988     }
1989     else {
1990         // n is not a power of 2, limit to n-1
1991         index1 = min(Vec8ui(index), n-1);
1992     }
1993     return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]],
1994     table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]);
1995 }
1996 
lookup4(Vec4q const & index,Vec4d const & table)1997 static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) {
1998     Vec2d  r0 = lookup4(index.get_low() , table.get_low(), table.get_high());
1999     Vec2d  r1 = lookup4(index.get_high(), table.get_low(), table.get_high());
2000     return Vec4d(r0, r1);
2001 }
2002 
2003 template <int n>
lookup(Vec4q const & index,double const * table)2004 static inline Vec4d lookup(Vec4q const & index, double const * table) {
2005     if (n <= 0) return 0;
2006     if (n <= 2) {
2007         Vec2d table1 = Vec2d().load(table);
2008         return Vec4d(
2009             lookup2 (index.get_low(),  table1),
2010             lookup2 (index.get_high(), table1));
2011     }
2012     // Limit index
2013     Vec8ui index1;
2014     if ((n & (n-1)) == 0) {
2015         // n is a power of 2, make index modulo n
2016         index1 = Vec8ui(index) & constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>();
2017     }
2018     else {
2019         // n is not a power of 2, limit to n-1
2020         index1 = min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>() );
2021     }
2022     Vec4q index2 = Vec4q(index1);
2023     return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]);
2024 }
2025 #endif  // VECTORI256_H
2026 
2027 /*****************************************************************************
2028 *
2029 *          Vector scatter functions
2030 *
2031 ******************************************************************************
2032 *
2033 * These functions write the elements of a vector to arbitrary positions in an
2034 * array in memory. Each vector element is written to an array position
2035 * determined by an index. An element is not written if the corresponding
2036 * index is out of range.
2037 * The indexes can be specified as constant template parameters or as an
2038 * integer vector.
2039 *
2040 * The scatter functions are useful if the data are distributed in a sparce
2041 * manner into the array. If the array is dense then it is more efficient
2042 * to permute the data into the right positions and then write the whole
2043 * permuted vector into the array.
2044 *
2045 * Example:
2046 * Vec8d a(10,11,12,13,14,15,16,17);
2047 * double b[16] = {0};
2048 * scatter<0,2,14,10,1,-1,5,9>(a,b);
2049 * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
2050 *
2051 *****************************************************************************/
2052 
2053 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
scatter(Vec8f const & data,float * array)2054 static inline void scatter(Vec8f const & data, float * array) {
2055     const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
2056     for (int i = 0; i < 8; i++) {
2057         if (index[i] >= 0) array[index[i]] = data[i];
2058     }
2059 }
2060 
2061 template <int i0, int i1, int i2, int i3>
scatter(Vec4d const & data,double * array)2062 static inline void scatter(Vec4d const & data, double * array) {
2063     const int index[4] = {i0,i1,i2,i3};
2064     for (int i = 0; i < 4; i++) {
2065         if (index[i] >= 0) array[index[i]] = data[i];
2066     }
2067 }
2068 
scatter(Vec8i const & index,uint32_t limit,Vec8f const & data,float * array)2069 static inline void scatter(Vec8i const & index, uint32_t limit, Vec8f const & data, float * array) {
2070     for (int i = 0; i < 8; i++) {
2071         if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
2072     }
2073 }
2074 
scatter(Vec4q const & index,uint32_t limit,Vec4d const & data,double * array)2075 static inline void scatter(Vec4q const & index, uint32_t limit, Vec4d const & data, double * array) {
2076     for (int i = 0; i < 4; i++) {
2077         if (uint64_t(index[i]) < uint64_t(limit)) array[index[i]] = data[i];
2078     }
2079 }
2080 
scatter(Vec4i const & index,uint32_t limit,Vec4d const & data,double * array)2081 static inline void scatter(Vec4i const & index, uint32_t limit, Vec4d const & data, double * array) {
2082     for (int i = 0; i < 4; i++) {
2083         if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
2084     }
2085 }
2086 
2087 /*****************************************************************************
2088 *
2089 *          Horizontal scan functions
2090 *
2091 *****************************************************************************/
2092 
2093 // Get index to the first element that is true. Return -1 if all are false
2094 
horizontal_find_first(Vec8fb const & x)2095 static inline int horizontal_find_first(Vec8fb const & x) {
2096     return horizontal_find_first(Vec8ib(x));
2097 }
2098 
horizontal_find_first(Vec4db const & x)2099 static inline int horizontal_find_first(Vec4db const & x) {
2100     return horizontal_find_first(Vec4qb(x));
2101 }
2102 
2103 // Count the number of elements that are true
horizontal_count(Vec8fb const & x)2104 static inline uint32_t horizontal_count(Vec8fb const & x) {
2105     return horizontal_count(Vec8ib(x));
2106 }
2107 
horizontal_count(Vec4db const & x)2108 static inline uint32_t horizontal_count(Vec4db const & x) {
2109     return horizontal_count(Vec4qb(x));
2110 }
2111 
2112 /*****************************************************************************
2113 *
2114 *          Boolean <-> bitfield conversion functions
2115 *
2116 *****************************************************************************/
2117 
2118 // to_bits: convert boolean vector to integer bitfield
to_bits(Vec8fb const & x)2119 static inline uint8_t to_bits(Vec8fb const & x) {
2120     return to_bits(Vec8ib(x));
2121 }
2122 
2123 // to_Vec8fb: convert integer bitfield to boolean vector
to_Vec8fb(uint8_t x)2124 static inline Vec8fb to_Vec8fb(uint8_t x) {
2125     return Vec8fb(to_Vec8ib(x));
2126 }
2127 
2128 // to_bits: convert boolean vector to integer bitfield
to_bits(Vec4db const & x)2129 static inline uint8_t to_bits(Vec4db const & x) {
2130     return to_bits(Vec4qb(x));
2131 }
2132 
2133 // to_Vec4db: convert integer bitfield to boolean vector
to_Vec4db(uint8_t x)2134 static inline Vec4db to_Vec4db(uint8_t x) {
2135     return Vec4db(to_Vec4qb(x));
2136 }
2137 
2138 #ifdef VCL_NAMESPACE
2139 }
2140 #endif
2141 
2142 #endif // VECTORF256_H
2143