/****************************  vectorf256.h   *******************************
* Author:        Agner Fog
* Date created:  2012-05-30
* Last modified: 2017-07-27
* Version:       1.30
* Project:       vector classes
* Description:
* Header file defining 256-bit floating point vector classes as interface
* to intrinsic functions in x86 microprocessors with AVX instruction set.
*
* Instructions:
* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
* instruction set, which must be at least AVX.
*
* The following vector classes are defined here:
* Vec8f     Vector of 8 single precision floating point numbers
* Vec8fb    Vector of 8 Booleans for use with Vec8f
* Vec4d     Vector of 4 double precision floating point numbers
* Vec4db    Vector of 4 Booleans for use with Vec4d
*
* Each vector object is represented internally in the CPU as a 256-bit register.
* This header file defines operators and functions for these vectors.
*
* For example:
* Vec4d a(1., 2., 3., 4.), b(5., 6., 7., 8.), c;
* c = a + b;     // now c contains (6., 8., 10., 12.)
*
* For detailed instructions, see VectorClass.pdf
*
* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/

// check combination of header files
#if defined (VECTORF256_H)
#if    VECTORF256_H != 2
#error Two different versions of vectorf256.h included
#endif
#else
#define VECTORF256_H  2

#if INSTRSET < 7   // AVX required
#error Please compile for the AVX instruction set or higher
#endif

#include "vectorf128.h"  // Define 128-bit vectors

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

/*****************************************************************************
*
*          select functions
*
*****************************************************************************/
// Select between two __m256 sources, element by element. Used in various functions
// and operators. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other
// values are allowed.
static inline __m256 selectf (__m256 const & s, __m256 const & a, __m256 const & b) {
    return _mm256_blendv_ps (b, a, s);
}

// Same, with two __m256d sources, element by element. Used in various functions
// and operators. Corresponds to this pseudocode:
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
// values are allowed.
static inline __m256d selectd (__m256d const & s, __m256d const & a, __m256d const & b) {
    return _mm256_blendv_pd (b, a, s);
}
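
// Example (illustrative): per-element minimum with a compare mask:
// __m256 mask = _mm256_cmp_ps(x, y, 1);   // all 1s where x < y (predicate 1 = _CMP_LT_OS)
// __m256 minv = selectf(mask, x, y);      // picks x where the mask is true, else y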



/*****************************************************************************
*
*          Generate compile-time constant vector
*
*****************************************************************************/
// Generate a constant vector of 8 integers stored in memory,
// load as __m256
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline __m256 constant8f() {
    static const union {
        int     i[8];
        __m256  ymm;
    } u = {{i0,i1,i2,i3,i4,i5,i6,i7}};
    return u.ymm;
}
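
// Example (illustrative): build masks known at compile time:
// __m256 allones  = constant8f<-1,-1,-1,-1,-1,-1,-1,-1>();  // every bit set
// __m256 signbits = constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,
//                              (int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>();  // -0.0f in each element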


/*****************************************************************************
*
*         Join two 128-bit vectors
*
*****************************************************************************/
#define set_m128r(lo,hi) _mm256_insertf128_ps(_mm256_castps128_ps256(lo),(hi),1)
    // _mm256_set_m128(hi,lo); // not defined in all versions of immintrin.h


/*****************************************************************************
*
*          Vec8fb: Vector of 8 Booleans for use with Vec8f
*
*****************************************************************************/

class Vec8fb {
protected:
    __m256 ymm; // Float vector
public:
    // Default constructor:
    Vec8fb() {
    }
    // Constructor to build from all elements:
    Vec8fb(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
#if INSTRSET >= 8  // AVX2
        ymm = _mm256_castsi256_ps(_mm256_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7));
#else
        __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3));
        __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b4, -(int)b5, -(int)b6, -(int)b7));
        ymm = set_m128r(blo,bhi);
#endif
    }
    // Constructor to build from two Vec4fb:
    Vec8fb(Vec4fb const & a0, Vec4fb const & a1) {
        ymm = set_m128r(a0, a1);
    }
    // Constructor to convert from type __m256 used in intrinsics:
    Vec8fb(__m256 const & x) {
        ymm = x;
    }
    // Assignment operator to convert from type __m256 used in intrinsics:
    Vec8fb & operator = (__m256 const & x) {
        ymm = x;
        return *this;
    }
    // Constructor to broadcast the same value into all elements:
    Vec8fb(bool b) {
#if INSTRSET >= 8  // AVX2
        ymm = _mm256_castsi256_ps(_mm256_set1_epi32(-(int)b));
#else
        __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
        //ymm = _mm256_set_m128(b1,b1);
        ymm = set_m128r(b1,b1);
#endif
    }
    // Assignment operator to broadcast scalar value:
    Vec8fb & operator = (bool b) {
        *this = Vec8fb(b);
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec8fb(int b);
    Vec8fb & operator = (int x);
public:
    // Type cast operator to convert to __m256 used in intrinsics
    operator __m256() const {
        return ymm;
    }
#if defined (VECTORI256_H)
#if VECTORI256_H >= 2  // AVX2 version
    // Constructor to convert from type Vec8ib used as Boolean for integer vectors
    Vec8fb(Vec8ib const & x) {
        ymm = _mm256_castsi256_ps(x);
    }
    // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
    Vec8fb & operator = (Vec8ib const & x) {
        ymm = _mm256_castsi256_ps(x);
        return *this;
    }
#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
    // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
    operator Vec8ib() const {
        return _mm256_castps_si256(ymm);
    }
#endif
#else
    // Constructor to convert from type Vec8ib used as Boolean for integer vectors
    Vec8fb(Vec8ib const & x) {
        ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high()));
    }
    // Assignment operator to convert from type Vec8ib used as Boolean for integer vectors
    Vec8fb & operator = (Vec8ib const & x) {
        ymm = set_m128r(_mm_castsi128_ps(x.get_low()), _mm_castsi128_ps(x.get_high()));
        return *this;
    }
    // Type cast operator to convert to type Vec8ib used as Boolean for integer vectors
    operator Vec8ib() const {
        return Vec8i(_mm_castps_si128(get_low()), _mm_castps_si128(get_high()));
    }
#endif
#endif // VECTORI256_H
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec8fb const & insert(uint32_t index, bool value) {
        static const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,0,0,0,0,0,0,0};
        __m256 mask  = _mm256_loadu_ps((float const*)(maskl+8-(index & 7))); // mask with FFFFFFFF at index position
        if (value) {
            ymm = _mm256_or_ps(ymm,mask);
        }
        else {
            ymm = _mm256_andnot_ps(mask,ymm);
        }
        return *this;
    }
    // Member function to extract a single element from vector
    bool extract(uint32_t index) const {
        union {
            float   f[8];
            int32_t i[8];
        } u;
        _mm256_storeu_ps(u.f, ymm);
        return u.i[index & 7] != 0;
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec4fb:
    Vec4fb get_low() const {
        return _mm256_castps256_ps128(ymm);
    }
    Vec4fb get_high() const {
        return _mm256_extractf128_ps(ymm,1);
    }
    static int size () {
        return 8;
    }
};
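
// Example (illustrative): a Vec8fb typically comes from a comparison:
// Vec8f  x, y;                 // assume these are loaded elsewhere
// Vec8fb m = x > y;            // per-element Boolean result
// bool   first = m[0];         // read one element with operator []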


/*****************************************************************************
*
*          Operators for Vec8fb
*
*****************************************************************************/

// vector operator & : bitwise and
static inline Vec8fb operator & (Vec8fb const & a, Vec8fb const & b) {
    return _mm256_and_ps(a, b);
}
static inline Vec8fb operator && (Vec8fb const & a, Vec8fb const & b) {
    return a & b;
}

// vector operator &= : bitwise and
static inline Vec8fb & operator &= (Vec8fb & a, Vec8fb const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec8fb operator | (Vec8fb const & a, Vec8fb const & b) {
    return _mm256_or_ps(a, b);
}
static inline Vec8fb operator || (Vec8fb const & a, Vec8fb const & b) {
    return a | b;
}

// vector operator |= : bitwise or
static inline Vec8fb & operator |= (Vec8fb & a, Vec8fb const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec8fb operator ^ (Vec8fb const & a, Vec8fb const & b) {
    return _mm256_xor_ps(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec8fb & operator ^= (Vec8fb & a, Vec8fb const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec8fb operator ~ (Vec8fb const & a) {
    return _mm256_xor_ps(a, constant8f<-1,-1,-1,-1,-1,-1,-1,-1>());
}

// vector operator ! : logical not
// (operator ! is less efficient than operator ~. Use only where not
// all bits in an element are the same)
static inline Vec8fb operator ! (Vec8fb const & a) {
    return Vec8fb( ! Vec8ib(a));
}

// Functions for Vec8fb

// andnot: a & ~ b
static inline Vec8fb andnot(Vec8fb const & a, Vec8fb const & b) {
    return _mm256_andnot_ps(b, a);
}



/*****************************************************************************
*
*          Horizontal Boolean functions
*
*****************************************************************************/

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec8fb const & a) {
    return _mm256_testc_ps(a,constant8f<-1,-1,-1,-1,-1,-1,-1,-1>()) != 0;
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec8fb const & a) {
    return ! _mm256_testz_ps(a,a);
}

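// Example (illustrative): branch on a whole-vector condition:
// Vec8fb m = x > 0.0f;
// if (horizontal_and(m)) { /* all eight elements of x are positive */ }
// else if (horizontal_or(m)) { /* some, but not all, are positive */ }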

/*****************************************************************************
*
*          Vec4db: Vector of 4 Booleans for use with Vec4d
*
*****************************************************************************/

class Vec4db {
protected:
    __m256d ymm; // double vector
public:
    // Default constructor:
    Vec4db() {
    }
    // Constructor to build from all elements:
    Vec4db(bool b0, bool b1, bool b2, bool b3) {
#if INSTRSET >= 8  // AVX2
        ymm = _mm256_castsi256_pd(_mm256_setr_epi64x(-(int64_t)b0, -(int64_t)b1, -(int64_t)b2, -(int64_t)b3));
#else
        __m128 blo = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1));
        __m128 bhi = _mm_castsi128_ps(_mm_setr_epi32(-(int)b2, -(int)b2, -(int)b3, -(int)b3));
        ymm = _mm256_castps_pd(set_m128r(blo, bhi));
#endif
    }
    // Constructor to build from two Vec2db:
    Vec4db(Vec2db const & a0, Vec2db const & a1) {
        ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0),_mm_castpd_ps(a1)));
        //ymm = _mm256_set_m128d(a1, a0);
    }
    // Constructor to convert from type __m256d used in intrinsics:
    Vec4db(__m256d const & x) {
        ymm = x;
    }
    // Assignment operator to convert from type __m256d used in intrinsics:
    Vec4db & operator = (__m256d const & x) {
        ymm = x;
        return *this;
    }
    // Constructor to broadcast the same value into all elements:
    Vec4db(bool b) {
#if INSTRSET >= 8  // AVX2
        ymm = _mm256_castsi256_pd(_mm256_set1_epi64x(-(int64_t)b));
#else
        __m128 b1 = _mm_castsi128_ps(_mm_set1_epi32(-(int)b));
        ymm = _mm256_castps_pd(set_m128r(b1,b1));
#endif
    }
    // Assignment operator to broadcast scalar value:
    Vec4db & operator = (bool b) {
        ymm = _mm256_castsi256_pd(_mm256_set1_epi32(-int32_t(b)));
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec4db(int b);
    Vec4db & operator = (int x);
public:
    // Type cast operator to convert to __m256d used in intrinsics
    operator __m256d() const {
        return ymm;
    }
#ifdef VECTORI256_H
#if VECTORI256_H == 2  // 256 bit integer vectors are available, AVX2
    // Constructor to convert from type Vec4qb used as Boolean for integer vectors
    Vec4db(Vec4qb const & x) {
        ymm = _mm256_castsi256_pd(x);
    }
    // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
    Vec4db & operator = (Vec4qb const & x) {
        ymm = _mm256_castsi256_pd(x);
        return *this;
    }
#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
    // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
    operator Vec4qb() const {
        return _mm256_castpd_si256(ymm);
    }
#endif
#else   // 256 bit integer vectors emulated without AVX2
    // Constructor to convert from type Vec4qb used as Boolean for integer vectors
    Vec4db(Vec4qb const & x) {
        *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high()));
    }
    // Assignment operator to convert from type Vec4qb used as Boolean for integer vectors
    Vec4db & operator = (Vec4qb const & x) {
        *this = Vec4db(_mm_castsi128_pd(x.get_low()), _mm_castsi128_pd(x.get_high()));
        return *this;
    }
    // Type cast operator to convert to type Vec4qb used as Boolean for integer vectors
    operator Vec4qb() const {
        return Vec4q(_mm_castpd_si128(get_low()), _mm_castpd_si128(get_high()));
    }
#endif
#endif // VECTORI256_H
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec4db const & insert(uint32_t index, bool value) {
        static const int32_t maskl[16] = {0,0,0,0,0,0,0,0,-1,-1,0,0,0,0,0,0};
        __m256d mask = _mm256_loadu_pd((double const*)(maskl+8-(index&3)*2)); // mask with FFFFFFFFFFFFFFFF at index position
        if (value) {
            ymm = _mm256_or_pd(ymm,mask);
        }
        else {
            ymm = _mm256_andnot_pd(mask,ymm);
        }
        return *this;
    }
    // Member function to extract a single element from vector
    bool extract(uint32_t index) const {
        union {
            double  f[8];
            int32_t i[16];
        } u;
        _mm256_storeu_pd(u.f, ymm);
        return u.i[(index & 3) * 2 + 1] != 0;
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec2db:
    Vec2db get_low() const {
        return _mm256_castpd256_pd128(ymm);
    }
    Vec2db get_high() const {
        return _mm256_extractf128_pd(ymm,1);
    }
    static int size () {
        return 4;
    }
};


/*****************************************************************************
*
*          Operators for Vec4db
*
*****************************************************************************/

// vector operator & : bitwise and
static inline Vec4db operator & (Vec4db const & a, Vec4db const & b) {
    return _mm256_and_pd(a, b);
}
static inline Vec4db operator && (Vec4db const & a, Vec4db const & b) {
    return a & b;
}

// vector operator &= : bitwise and
static inline Vec4db & operator &= (Vec4db & a, Vec4db const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec4db operator | (Vec4db const & a, Vec4db const & b) {
    return _mm256_or_pd(a, b);
}
static inline Vec4db operator || (Vec4db const & a, Vec4db const & b) {
    return a | b;
}

// vector operator |= : bitwise or
static inline Vec4db & operator |= (Vec4db & a, Vec4db const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec4db operator ^ (Vec4db const & a, Vec4db const & b) {
    return _mm256_xor_pd(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec4db & operator ^= (Vec4db & a, Vec4db const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec4db operator ~ (Vec4db const & a) {
    return _mm256_xor_pd(a, _mm256_castps_pd (constant8f<-1,-1,-1,-1,-1,-1,-1,-1>()));
}

// vector operator ! : logical not
// (operator ! is less efficient than operator ~. Use only where not
// all bits in an element are the same)
static inline Vec4db operator ! (Vec4db const & a) {
    return Vec4db( ! Vec4qb(a));
}

// Functions for Vec4db

// andnot: a & ~ b
static inline Vec4db andnot(Vec4db const & a, Vec4db const & b) {
    return _mm256_andnot_pd(b, a);
}



/*****************************************************************************
*
*          Horizontal Boolean functions
*
*****************************************************************************/

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec4db const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    return horizontal_and(Vec256b(_mm256_castpd_si256(a)));
#else  // split into 128 bit vectors
    return horizontal_and(a.get_low() & a.get_high());
#endif
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec4db const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    return horizontal_or(Vec256b(_mm256_castpd_si256(a)));
#else  // split into 128 bit vectors
    return horizontal_or(a.get_low() | a.get_high());
#endif
}


/*****************************************************************************
*
*          Vec8f: Vector of 8 single precision floating point values
*
*****************************************************************************/

class Vec8f {
protected:
    __m256 ymm; // Float vector
public:
    // Default constructor:
    Vec8f() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec8f(float f) {
        ymm = _mm256_set1_ps(f);
    }
    // Constructor to build from all elements:
    Vec8f(float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7) {
        ymm = _mm256_setr_ps(f0, f1, f2, f3, f4, f5, f6, f7);
    }
    // Constructor to build from two Vec4f:
    Vec8f(Vec4f const & a0, Vec4f const & a1) {
        ymm = set_m128r(a0, a1);
        //ymm = _mm256_set_m128(a1, a0);
    }
    // Constructor to convert from type __m256 used in intrinsics:
    Vec8f(__m256 const & x) {
        ymm = x;
    }
    // Assignment operator to convert from type __m256 used in intrinsics:
    Vec8f & operator = (__m256 const & x) {
        ymm = x;
        return *this;
    }
    // Type cast operator to convert to __m256 used in intrinsics
    operator __m256() const {
        return ymm;
    }
    // Member function to load from array (unaligned)
    Vec8f & load(float const * p) {
        ymm = _mm256_loadu_ps(p);
        return *this;
    }
    // Member function to load from array, aligned by 32
    // You may use load_a instead of load if you are certain that p points to an address
    // divisible by 32.
    Vec8f & load_a(float const * p) {
        ymm = _mm256_load_ps(p);
        return *this;
    }
    // Member function to store into array (unaligned)
    void store(float * p) const {
        _mm256_storeu_ps(p, ymm);
    }
    // Member function to store into array, aligned by 32
    // You may use store_a instead of store if you are certain that p points to an address
    // divisible by 32.
    void store_a(float * p) const {
        _mm256_store_ps(p, ymm);
    }
    // Partial load. Load n elements and set the rest to 0
    Vec8f & load_partial(int n, float const * p) {
        if (n > 0 && n <= 4) {
            *this = Vec8f(Vec4f().load_partial(n, p), _mm_setzero_ps());
            // ymm = _mm256_castps128_ps256(Vec4f().load_partial<n>(p)); (this doesn't work on MS compiler due to sloppy definition of the cast)
        }
        else if (n > 4 && n <= 8) {
            *this = Vec8f(Vec4f().load(p), Vec4f().load_partial(n - 4, p + 4));
        }
        else {
            ymm = _mm256_setzero_ps();
        }
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, float * p) const {
        if (n <= 4) {
            get_low().store_partial(n, p);
        }
        else if (n <= 8) {
            get_low().store(p);
            get_high().store_partial(n - 4, p + 4);
        }
    }
    // cut off vector to n elements. The last 8-n elements are set to zero
    Vec8f & cutoff(int n) {
        if (uint32_t(n) >= 8) return *this;
        static const union {
            int32_t i[16];
            float   f[16];
        } mask = {{-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0}};
        *this = Vec8fb(*this) & Vec8fb(Vec8f().load(mask.f + 8 - n));
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec8f const & insert(uint32_t index, float value) {
        __m256 v0 = _mm256_broadcast_ss(&value);
        switch (index) {
        case 0:
            ymm = _mm256_blend_ps (ymm, v0, 1);  break;
        case 1:
            ymm = _mm256_blend_ps (ymm, v0, 2);  break;
        case 2:
            ymm = _mm256_blend_ps (ymm, v0, 4);  break;
        case 3:
            ymm = _mm256_blend_ps (ymm, v0, 8);  break;
        case 4:
            ymm = _mm256_blend_ps (ymm, v0, 0x10);  break;
        case 5:
            ymm = _mm256_blend_ps (ymm, v0, 0x20);  break;
        case 6:
            ymm = _mm256_blend_ps (ymm, v0, 0x40);  break;
        default:
            ymm = _mm256_blend_ps (ymm, v0, 0x80);  break;
        }
        return *this;
    }
    // Member function to extract a single element from vector
    float extract(uint32_t index) const {
        float x[8];
        store(x);
        return x[index & 7];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    float operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec4f:
    Vec4f get_low() const {
        return _mm256_castps256_ps128(ymm);
    }
    Vec4f get_high() const {
        return _mm256_extractf128_ps(ymm,1);
    }
    static int size () {
        return 8;
    }
};
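
// Example (illustrative): load, transform and store eight floats:
// float in[8] = {0,1,2,3,4,5,6,7}, out[8];
// Vec8f v;
// v.load(in);                  // unaligned load
// v = v * 2.0f + 1.0f;         // element-wise arithmetic
// v.store(out);                // out = {1,3,5,7,9,11,13,15}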


/*****************************************************************************
*
*          Operators for Vec8f
*
*****************************************************************************/

// vector operator + : add element by element
static inline Vec8f operator + (Vec8f const & a, Vec8f const & b) {
    return _mm256_add_ps(a, b);
}

// vector operator + : add vector and scalar
static inline Vec8f operator + (Vec8f const & a, float b) {
    return a + Vec8f(b);
}
static inline Vec8f operator + (float a, Vec8f const & b) {
    return Vec8f(a) + b;
}

// vector operator += : add
static inline Vec8f & operator += (Vec8f & a, Vec8f const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec8f operator ++ (Vec8f & a, int) {
    Vec8f a0 = a;
    a = a + 1.0f;
    return a0;
}

// prefix operator ++
static inline Vec8f & operator ++ (Vec8f & a) {
    a = a + 1.0f;
    return a;
}

// vector operator - : subtract element by element
static inline Vec8f operator - (Vec8f const & a, Vec8f const & b) {
    return _mm256_sub_ps(a, b);
}

// vector operator - : subtract vector and scalar
static inline Vec8f operator - (Vec8f const & a, float b) {
    return a - Vec8f(b);
}
static inline Vec8f operator - (float a, Vec8f const & b) {
    return Vec8f(a) - b;
}

// vector operator - : unary minus
// Change sign bit, even for 0, INF and NAN
static inline Vec8f operator - (Vec8f const & a) {
    return _mm256_xor_ps(a, constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000> ());
}

// vector operator -= : subtract
static inline Vec8f & operator -= (Vec8f & a, Vec8f const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec8f operator -- (Vec8f & a, int) {
    Vec8f a0 = a;
    a = a - 1.0f;
    return a0;
}

// prefix operator --
static inline Vec8f & operator -- (Vec8f & a) {
    a = a - 1.0f;
    return a;
}

// vector operator * : multiply element by element
static inline Vec8f operator * (Vec8f const & a, Vec8f const & b) {
    return _mm256_mul_ps(a, b);
}

// vector operator * : multiply vector and scalar
static inline Vec8f operator * (Vec8f const & a, float b) {
    return a * Vec8f(b);
}
static inline Vec8f operator * (float a, Vec8f const & b) {
    return Vec8f(a) * b;
}

// vector operator *= : multiply
static inline Vec8f & operator *= (Vec8f & a, Vec8f const & b) {
    a = a * b;
    return a;
}

// vector operator / : divide element by element
static inline Vec8f operator / (Vec8f const & a, Vec8f const & b) {
    return _mm256_div_ps(a, b);
}

// vector operator / : divide vector and scalar
static inline Vec8f operator / (Vec8f const & a, float b) {
    return a / Vec8f(b);
}
static inline Vec8f operator / (float a, Vec8f const & b) {
    return Vec8f(a) / b;
}

// vector operator /= : divide
static inline Vec8f & operator /= (Vec8f & a, Vec8f const & b) {
    a = a / b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec8fb operator == (Vec8f const & a, Vec8f const & b) {
    return _mm256_cmp_ps(a, b, 0);     // predicate 0 = _CMP_EQ_OQ
}

// vector operator != : returns true for elements for which a != b
static inline Vec8fb operator != (Vec8f const & a, Vec8f const & b) {
    return _mm256_cmp_ps(a, b, 4);     // predicate 4 = _CMP_NEQ_UQ
}

// vector operator < : returns true for elements for which a < b
static inline Vec8fb operator < (Vec8f const & a, Vec8f const & b) {
    return _mm256_cmp_ps(a, b, 1);     // predicate 1 = _CMP_LT_OS
}

// vector operator <= : returns true for elements for which a <= b
static inline Vec8fb operator <= (Vec8f const & a, Vec8f const & b) {
    return _mm256_cmp_ps(a, b, 2);     // predicate 2 = _CMP_LE_OS
}

// vector operator > : returns true for elements for which a > b
static inline Vec8fb operator > (Vec8f const & a, Vec8f const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b
static inline Vec8fb operator >= (Vec8f const & a, Vec8f const & b) {
    return b <= a;
}

// Bitwise logical operators

// vector operator & : bitwise and
static inline Vec8f operator & (Vec8f const & a, Vec8f const & b) {
    return _mm256_and_ps(a, b);
}

// vector operator &= : bitwise and
static inline Vec8f & operator &= (Vec8f & a, Vec8f const & b) {
    a = a & b;
    return a;
}

// vector operator & : bitwise and of Vec8f and Vec8fb
static inline Vec8f operator & (Vec8f const & a, Vec8fb const & b) {
    return _mm256_and_ps(a, b);
}
static inline Vec8f operator & (Vec8fb const & a, Vec8f const & b) {
    return _mm256_and_ps(a, b);
}

// vector operator | : bitwise or
static inline Vec8f operator | (Vec8f const & a, Vec8f const & b) {
    return _mm256_or_ps(a, b);
}

// vector operator |= : bitwise or
static inline Vec8f & operator |= (Vec8f & a, Vec8f const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec8f operator ^ (Vec8f const & a, Vec8f const & b) {
    return _mm256_xor_ps(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec8f & operator ^= (Vec8f & a, Vec8f const & b) {
    a = a ^ b;
    return a;
}

// vector operator ! : logical not. Returns Boolean vector
static inline Vec8fb operator ! (Vec8f const & a) {
    return a == Vec8f(0.0f);
}


/*****************************************************************************
*
*          Functions for Vec8f
*
*****************************************************************************/

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
static inline Vec8f select (Vec8fb const & s, Vec8f const & a, Vec8f const & b) {
    return _mm256_blendv_ps (b, a, s);
}
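
// Example (illustrative): clamp negative elements to zero without branching:
// Vec8f y = select(x < 0.0f, Vec8f(0.0f), x);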

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec8f if_add (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
    return a + (Vec8f(f) & b);
}

// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
static inline Vec8f if_mul (Vec8fb const & f, Vec8f const & a, Vec8f const & b) {
    return a * select(f, b, 1.f);
}
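
// Example (illustrative): accumulate x into sum only where a condition holds,
// without branching (x, sum and limit are placeholder variables):
// sum = if_add(x > limit, sum, x);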


// General arithmetic functions, etc.

// Horizontal add: Calculates the sum of all vector elements.
static inline float horizontal_add (Vec8f const & a) {
    __m256 t1 = _mm256_hadd_ps(a,a);                       // pairwise sums within each 128-bit lane
    __m256 t2 = _mm256_hadd_ps(t1,t1);                     // each lane now holds its own total in every element
    __m128 t3 = _mm256_extractf128_ps(t2,1);               // total of the upper lane
    __m128 t4 = _mm_add_ss(_mm256_castps256_ps128(t2),t3); // add lower and upper lane totals
    return _mm_cvtss_f32(t4);
}
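
// Example (illustrative): horizontal_add(Vec8f(1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f))
// returns 36.0f.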

// function max: a > b ? a : b
static inline Vec8f max(Vec8f const & a, Vec8f const & b) {
    return _mm256_max_ps(a,b);
}

// function min: a < b ? a : b
static inline Vec8f min(Vec8f const & a, Vec8f const & b) {
    return _mm256_min_ps(a,b);
}

// function abs: absolute value
// Removes sign bit, even for -0.0f, -INF and -NAN
static inline Vec8f abs(Vec8f const & a) {
    __m256 mask = constant8f<0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF,0x7FFFFFFF> ();
    return _mm256_and_ps(a,mask);
}

// function sqrt: square root
static inline Vec8f sqrt(Vec8f const & a) {
    return _mm256_sqrt_ps(a);
}

// function square: a * a
static inline Vec8f square(Vec8f const & a) {
    return a * a;
}

// pow(Vec8f, int):
template <typename TT> static Vec8f pow(Vec8f const & a, TT const & n);

// Raise floating point numbers to integer power n
template <>
inline Vec8f pow<int>(Vec8f const & x0, int const & n) {
    return pow_template_i<Vec8f>(x0, n);
}

// allow conversion from unsigned int
template <>
inline Vec8f pow<uint32_t>(Vec8f const & x0, uint32_t const & n) {
    return pow_template_i<Vec8f>(x0, (int)n);
}


// Raise floating point numbers to integer power n, where n is a compile-time constant
template <int n>
static inline Vec8f pow_n(Vec8f const & a) {
    if (n < 0)    return Vec8f(1.0f) / pow_n<-n>(a);
    if (n == 0)   return Vec8f(1.0f);
    if (n >= 256) return pow(a, n);
    Vec8f x = a;                       // a^(2^i)
    Vec8f y;                           // accumulator
    const int lowest = n - (n & (n-1));// lowest set bit in n
    if (n & 1) y = x;
    if (n < 2) return y;
    x = x*x;                           // x^2
    if (n & 2) {
        if (lowest == 2) y = x; else y *= x;
    }
    if (n < 4) return y;
    x = x*x;                           // x^4
    if (n & 4) {
        if (lowest == 4) y = x; else y *= x;
    }
    if (n < 8) return y;
    x = x*x;                           // x^8
    if (n & 8) {
        if (lowest == 8) y = x; else y *= x;
    }
    if (n < 16) return y;
    x = x*x;                           // x^16
    if (n & 16) {
        if (lowest == 16) y = x; else y *= x;
    }
    if (n < 32) return y;
    x = x*x;                           // x^32
    if (n & 32) {
        if (lowest == 32) y = x; else y *= x;
    }
    if (n < 64) return y;
    x = x*x;                           // x^64
    if (n & 64) {
        if (lowest == 64) y = x; else y *= x;
    }
    if (n < 128) return y;
    x = x*x;                           // x^128
    if (n & 128) {
        if (lowest == 128) y = x; else y *= x;
    }
    return y;
}

template <int n>
static inline Vec8f pow(Vec8f const & a, Const_int_t<n>) {
    return pow_n<n>(a);
}
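
// Example (illustrative): raising each element to a fixed power:
// Vec8f y1 = pow(x, 5);       // run-time exponent, loops over the bits of n
// Vec8f y2 = pow_n<5>(x);     // compile-time exponent, fully unrolled to 3 multiplies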


// function round: round to nearest integer (even). (result as float vector)
static inline Vec8f round(Vec8f const & a) {
    return _mm256_round_ps(a, 0+8);    // 0 = to nearest, +8 = _MM_FROUND_NO_EXC
}

// function truncate: round towards zero. (result as float vector)
static inline Vec8f truncate(Vec8f const & a) {
    return _mm256_round_ps(a, 3+8);    // 3 = towards zero
}

// function floor: round towards minus infinity. (result as float vector)
static inline Vec8f floor(Vec8f const & a) {
    return _mm256_round_ps(a, 1+8);    // 1 = towards minus infinity
}

// function ceil: round towards plus infinity. (result as float vector)
static inline Vec8f ceil(Vec8f const & a) {
    return _mm256_round_ps(a, 2+8);    // 2 = towards plus infinity
}

#ifdef VECTORI256_H  // 256 bit integer vectors are available
#if VECTORI256_H > 1  // AVX2
// function round_to_int: round to nearest integer (even). (result as integer vector)
static inline Vec8i round_to_int(Vec8f const & a) {
    // Note: assume MXCSR control register is set to rounding to nearest (the default)
    return _mm256_cvtps_epi32(a);
}

// function truncate_to_int: round towards zero. (result as integer vector)
static inline Vec8i truncate_to_int(Vec8f const & a) {
    return _mm256_cvttps_epi32(a);
}

// function to_float: convert integer vector to float vector
static inline Vec8f to_float(Vec8i const & a) {
    return _mm256_cvtepi32_ps(a);
}

// function to_float: convert unsigned integer vector to float vector
static inline Vec8f to_float(Vec8ui const & a) {
#ifdef __AVX512VL__
    return _mm256_cvtepu32_ps(a);
#else
    Vec8f b = to_float(Vec8i(a & 0x7FFFFFFF));             // lower 31 bits
    Vec8i c = Vec8i(a) >> 31;                              // generate mask from highest bit
    Vec8f d = Vec8f(2147483648.f) & Vec8f(_mm256_castsi256_ps(c));// mask floating point constant 2^31
    return b + d;
#endif
}

#else // no AVX2

// function round_to_int: round to nearest integer (even). (result as integer vector)
static inline Vec8i round_to_int(Vec8f const & a) {
    // Note: assume MXCSR control register is set to rounding to nearest (the default)
    return Vec8i(_mm_cvtps_epi32(a.get_low()), _mm_cvtps_epi32(a.get_high()));
}

// function truncate_to_int: round towards zero. (result as integer vector)
static inline Vec8i truncate_to_int(Vec8f const & a) {
    return Vec8i(_mm_cvttps_epi32(a.get_low()), _mm_cvttps_epi32(a.get_high()));
}

// function to_float: convert integer vector to float vector
static inline Vec8f to_float(Vec8i const & a) {
    return Vec8f(_mm_cvtepi32_ps(a.get_low()), _mm_cvtepi32_ps(a.get_high()));
}

// function to_float: convert unsigned integer vector to float vector
static inline Vec8f to_float(Vec8ui const & a) {
    return Vec8f(to_float(a.get_low()), to_float(a.get_high()));
}
#endif
#endif // VECTORI256_H


// Fused multiply and add functions

// Multiply and add
static inline Vec8f mul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
#ifdef __FMA__
    return _mm256_fmadd_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm256_macc_ps(a, b, c);
#else
    return a * b + c;
#endif
}

// Multiply and subtract
static inline Vec8f mul_sub(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
#ifdef __FMA__
    return _mm256_fmsub_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm256_msub_ps(a, b, c);
#else
    return a * b - c;
#endif
}

// Multiply and inverse subtract
static inline Vec8f nmul_add(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
#ifdef __FMA__
    return _mm256_fnmadd_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm256_nmacc_ps(a, b, c);
#else
    return c - a * b;
#endif
}


// Multiply and subtract with extra precision on the intermediate calculations,
// even if FMA instructions not supported, using Veltkamp-Dekker split
static inline Vec8f mul_sub_x(Vec8f const & a, Vec8f const & b, Vec8f const & c) {
#ifdef __FMA__
    return _mm256_fmsub_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm256_msub_ps(a, b, c);
#else
    // calculate a * b - c with extra precision
    const int b12 = -(1 << 12);                  // mask to remove lower 12 bits
    Vec8f upper_mask = constant8f<b12,b12,b12,b12,b12,b12,b12,b12>();
    Vec8f a_high = a & upper_mask;               // split into high and low parts
    Vec8f b_high = b & upper_mask;
    Vec8f a_low  = a - a_high;
    Vec8f b_low  = b - b_high;
    Vec8f r1 = a_high * b_high;                  // this product is exact
    Vec8f r2 = r1 - c;                           // subtract c from high product
    Vec8f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
    return r3; // + ((r2 - r1) + c);
#endif
}
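
// (In the no-FMA branch above, masking off the 12 low bits of each mantissa leaves
// a_high and b_high with at most 12 significant bits each, so their product fits
// exactly in the 24-bit float mantissa. That is what makes the split exact.)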


// Approximate math functions

// approximate reciprocal (Faster than 1.f / a. Relative accuracy better than 2^-11)
static inline Vec8f approx_recipr(Vec8f const & a) {
#if INSTRSET >= 9  // use more accurate version if available. (none of these will raise exceptions on zero)
#ifdef __AVX512ER__  // AVX512ER: full precision
    // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
    return _mm512_castps512_ps256(_mm512_rcp28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
    return _mm256_rcp14_ps(a);
#else  // AVX512F: 14 bit precision
    return _mm512_castps512_ps256(_mm512_rcp14_ps(_mm512_castps256_ps512(a)));
#endif
#else  // AVX: 11 bit precision
    return _mm256_rcp_ps(a);
#endif
}

// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
static inline Vec8f approx_rsqrt(Vec8f const & a) {
#if INSTRSET >= 9  // use more accurate version if available. (none of these will raise exceptions on zero)
#ifdef __AVX512ER__  // AVX512ER: full precision
    // todo: if future processors have both AVX512ER and AVX512VL: _mm256_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
    return _mm512_castps512_ps256(_mm512_rsqrt28_round_ps(_mm512_castps256_ps512(a), _MM_FROUND_NO_EXC));
#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
    return _mm256_rsqrt14_ps(a);
#else  // AVX512F: 14 bit precision
    return _mm512_castps512_ps256(_mm512_rsqrt14_ps(_mm512_castps256_ps512(a)));
#endif
#else  // AVX: 11 bit precision
    return _mm256_rsqrt_ps(a);
#endif
}
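
// Example (illustrative): one Newton-Raphson step roughly doubles the bits of
// precision of the 11-bit approximation, at the cost of two extra multiplies:
// Vec8f r = approx_recipr(a);
// r = r * (2.0f - a * r);      // refined reciprocal, roughly 22 bit accuracy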


// Math functions using fast bit manipulation

#ifdef VECTORI256_H  // 256 bit integer vectors are available, AVX2
// Extract the exponent as an integer
// exponent(a) = floor(log2(abs(a)));
// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
static inline Vec8i exponent(Vec8f const & a) {
#if  VECTORI256_H > 1  // AVX2
    Vec8ui t1 = _mm256_castps_si256(a);// reinterpret as 32-bit integer
    Vec8ui t2 = t1 << 1;               // shift out sign bit
    Vec8ui t3 = t2 >> 24;              // shift down logical to position 0
    Vec8i  t4 = Vec8i(t3) - 0x7F;      // subtract bias from exponent
    return t4;
#else  // no AVX2
    return Vec8i(exponent(a.get_low()), exponent(a.get_high()));
#endif
}
#endif

// Extract the fraction part of a floating point number
// a = 2^exponent(a) * fraction(a), except for a = 0
// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
static inline Vec8f fraction(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8ui t1 = _mm256_castps_si256(a);   // reinterpret as 32-bit integer
    Vec8ui t2 = (t1 & 0x007FFFFF) | 0x3F800000; // set exponent to 0 + bias
    return _mm256_castsi256_ps(t2);
#else
    return Vec8f(fraction(a.get_low()), fraction(a.get_high()));
#endif
}

#ifdef VECTORI256_H  // 256 bit integer vectors are available, AVX2
// Fast calculation of pow(2,n) with n integer
// n  =    0 gives 1.0f
// n >=  128 gives +INF
// n <= -127 gives 0.0f
// This function will never produce denormals, and never raise exceptions
static inline Vec8f exp2(Vec8i const & n) {
#if  VECTORI256_H > 1  // AVX2
    Vec8i t1 = max(n,  -0x7F);         // limit to allowed range
    Vec8i t2 = min(t1,  0x80);
    Vec8i t3 = t2 + 0x7F;              // add bias
    Vec8i t4 = t3 << 23;               // put exponent into position 23
    return _mm256_castsi256_ps(t4);    // reinterpret as float
#else
    return Vec8f(exp2(n.get_low()), exp2(n.get_high()));
#endif
}
//static inline Vec8f exp2(Vec8f const & x); // defined in vectormath_exp.h

#endif // VECTORI256_H
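
// Example (illustrative): a normal, nonzero float can be reassembled from its parts:
// Vec8f a(5.0f);
// Vec8i e = exponent(a);       // 2
// Vec8f f = fraction(a);       // 1.25f
// Vec8f r = f * exp2(e);       // 1.25f * 4.0f = 5.0f again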


// Categorization functions

// Function sign_bit: gives true for elements that have the sign bit set
// even for -0.0f, -INF and -NAN
// Note that sign_bit(Vec8f(-0.0f)) gives true, while Vec8f(-0.0f) < Vec8f(0.0f) gives false
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec8fb sign_bit(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
    Vec8i t2 = t1 >> 31;                  // extend sign bit
    return _mm256_castsi256_ps(t2);       // reinterpret as 32-bit Boolean
#else
    return Vec8fb(sign_bit(a.get_low()), sign_bit(a.get_high()));
#endif
}

// Function sign_combine: changes the sign of a when b has the sign bit set
// same as select(sign_bit(b), -a, a)
static inline Vec8f sign_combine(Vec8f const & a, Vec8f const & b) {
    Vec8f signmask = constant8f<(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000,(int)0x80000000>();  // -0.0
    return a ^ (b & signmask);
}

// Function is_finite: gives true for elements that are normal, denormal or zero,
// false for INF and NAN
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec8fb is_finite(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
    Vec8i t2 = t1 << 1;                   // shift out sign bit
    Vec8ib t3 = Vec8i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s
    return t3;
#else
    return Vec8fb(is_finite(a.get_low()), is_finite(a.get_high()));
#endif
}

// Function is_inf: gives true for elements that are +INF or -INF
// false for finite numbers and NAN
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec8fb is_inf(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer
    Vec8i t2 = t1 << 1;                // shift out sign bit
    return t2 == 0xFF000000;           // exponent is all 1s, fraction is 0
#else
    return Vec8fb(is_inf(a.get_low()), is_inf(a.get_high()));
#endif
}

// Function is_nan: gives true for elements that are +NAN or -NAN
// false for finite numbers and +/-INF
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec8fb is_nan(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8i t1 = _mm256_castps_si256(a); // reinterpret as 32-bit integer
    Vec8i t2 = t1 << 1;                // shift out sign bit
    Vec8i t3 = 0xFF000000;             // exponent mask
    Vec8i t4 = t2 & t3;                // exponent
    Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction
    return Vec8ib(t4 == t3 && t5 != 0);// exponent = all 1s and fraction != 0
#else
    return Vec8fb(is_nan(a.get_low()), is_nan(a.get_high()));
#endif
}

// Function is_subnormal: gives true for elements that are denormal (subnormal)
// false for normal numbers, zero, NAN and INF
static inline Vec8fb is_subnormal(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec8i t1 = _mm256_castps_si256(a);    // reinterpret as 32-bit integer
    Vec8i t2 = t1 << 1;                   // shift out sign bit
    Vec8i t3 = 0xFF000000;                // exponent mask
    Vec8i t4 = t2 & t3;                   // exponent
    Vec8i t5 = _mm256_andnot_si256(t3,t2);// fraction
    return Vec8ib(t4 == 0 && t5 != 0);    // exponent = 0 and fraction != 0
#else
    return Vec8fb(is_subnormal(a.get_low()), is_subnormal(a.get_high()));
#endif
}

// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
// false for normal numbers, NAN and INF
static inline Vec8fb is_zero_or_subnormal(Vec8f const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1   // 256 bit integer vectors are available, AVX2
    Vec8i t = _mm256_castps_si256(a);            // reinterpret as 32-bit integer
          t &= 0x7F800000;                       // isolate exponent
    return t == 0;                               // exponent = 0
#else
    return Vec8fb(is_zero_or_subnormal(a.get_low()), is_zero_or_subnormal(a.get_high()));
#endif
}

// Function infinite8f: returns a vector where all elements are +INF
static inline Vec8f infinite8f() {
    return constant8f<0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000,0x7F800000>();
}

// Function nan8f: returns a vector where all elements are +NAN (quiet)
static inline Vec8f nan8f(int n = 0x10) {
    return _mm256_castsi256_ps(_mm256_set1_epi32(0x7FC00000 + n));
}

// change signs on vectors Vec8f
// Each index i0 - i7 is 1 for changing sign on the corresponding element, 0 for no change
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8f change_sign(Vec8f const & a) {
    if ((i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7) == 0) return a;
    __m256 mask = constant8f<i0 ? (int)0x80000000 : 0, i1 ? (int)0x80000000 : 0, i2 ? (int)0x80000000 : 0, i3 ? (int)0x80000000 : 0,
        i4 ? (int)0x80000000 : 0, i5 ? (int)0x80000000 : 0, i6 ? (int)0x80000000 : 0, i7 ? (int)0x80000000 : 0> ();
    return _mm256_xor_ps(a, mask);
}
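
// Example (illustrative): negate only the odd-indexed elements:
// Vec8f y = change_sign<0,1,0,1,0,1,0,1>(x);   // (x0,-x1,x2,-x3,x4,-x5,x6,-x7)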



/*****************************************************************************
*
*          Vec4d: Vector of 4 double precision floating point values
*
*****************************************************************************/

class Vec4d {
protected:
    __m256d ymm; // double vector
public:
    // Default constructor:
    Vec4d() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec4d(double d) {
        ymm = _mm256_set1_pd(d);
    }
    // Constructor to build from all elements:
    Vec4d(double d0, double d1, double d2, double d3) {
        ymm = _mm256_setr_pd(d0, d1, d2, d3);
    }
    // Constructor to build from two Vec2d:
    Vec4d(Vec2d const & a0, Vec2d const & a1) {
        ymm = _mm256_castps_pd(set_m128r(_mm_castpd_ps(a0), _mm_castpd_ps(a1)));
        //ymm = _mm256_set_m128d(a1, a0);
    }
    // Constructor to convert from type __m256d used in intrinsics:
    Vec4d(__m256d const & x) {
        ymm = x;
    }
    // Assignment operator to convert from type __m256d used in intrinsics:
    Vec4d & operator = (__m256d const & x) {
        ymm = x;
        return *this;
    }
    // Type cast operator to convert to __m256d used in intrinsics
    operator __m256d() const {
        return ymm;
    }
    // Member function to load from array (unaligned)
    Vec4d & load(double const * p) {
        ymm = _mm256_loadu_pd(p);
        return *this;
    }
    // Member function to load from array, aligned by 32
    // You may use load_a instead of load if you are certain that p points to an address
    // divisible by 32
    Vec4d & load_a(double const * p) {
        ymm = _mm256_load_pd(p);
        return *this;
    }
    // Member function to store into array (unaligned)
    void store(double * p) const {
        _mm256_storeu_pd(p, ymm);
    }
    // Member function to store into array, aligned by 32
    // You may use store_a instead of store if you are certain that p points to an address
    // divisible by 32
    void store_a(double * p) const {
        _mm256_store_pd(p, ymm);
    }
    // Partial load. Load n elements and set the rest to 0
    Vec4d & load_partial(int n, double const * p) {
        if (n > 0 && n <= 2) {
            *this = Vec4d(Vec2d().load_partial(n, p), _mm_setzero_pd());
        }
        else if (n > 2 && n <= 4) {
            *this = Vec4d(Vec2d().load(p), Vec2d().load_partial(n - 2, p + 2));
        }
        else {
            ymm = _mm256_setzero_pd();
        }
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, double * p) const {
        if (n <= 2) {
            get_low().store_partial(n, p);
        }
        else if (n <= 4) {
            get_low().store(p);
            get_high().store_partial(n - 2, p + 2);
        }
    }
    // cut off vector to n elements. The last 4-n elements are set to zero
    Vec4d & cutoff(int n) {
        ymm = _mm256_castps_pd(Vec8f(_mm256_castpd_ps(ymm)).cutoff(n*2));
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec4d const & insert(uint32_t index, double value) {
        __m256d v0 = _mm256_broadcast_sd(&value);
        switch (index) {
        case 0:
            ymm = _mm256_blend_pd (ymm, v0, 1);  break;
        case 1:
            ymm = _mm256_blend_pd (ymm, v0, 2);  break;
        case 2:
            ymm = _mm256_blend_pd (ymm, v0, 4);  break;
        default:
            ymm = _mm256_blend_pd (ymm, v0, 8);  break;
        }
        return *this;
    }
    // Member function to extract a single element from vector
    double extract(uint32_t index) const {
        double x[4];
        store(x);
        return x[index & 3];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    double operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec2d:
    Vec2d get_low() const {
        return _mm256_castpd256_pd128(ymm);
    }
    Vec2d get_high() const {
        return _mm256_extractf128_pd(ymm,1);
    }
    static int size () {
        return 4;
    }
};
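
// Example of use (an illustrative sketch; the array and values are hypothetical):
//   double buf[4] = {1.5, 2.5, 3.5, 4.5};
//   Vec4d a;
//   a.load(buf);            // a is (1.5, 2.5, 3.5, 4.5)
//   a.insert(2, 9.0);       // a is (1.5, 2.5, 9.0, 4.5)
//   double x = a[1];        // x is 2.5
//   a.store(buf);           // buf is {1.5, 2.5, 9.0, 4.5}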



/*****************************************************************************
*
*          Operators for Vec4d
*
*****************************************************************************/

// vector operator + : add element by element
static inline Vec4d operator + (Vec4d const & a, Vec4d const & b) {
    return _mm256_add_pd(a, b);
}

// vector operator + : add vector and scalar
static inline Vec4d operator + (Vec4d const & a, double b) {
    return a + Vec4d(b);
}
static inline Vec4d operator + (double a, Vec4d const & b) {
    return Vec4d(a) + b;
}

// vector operator += : add
static inline Vec4d & operator += (Vec4d & a, Vec4d const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec4d operator ++ (Vec4d & a, int) {
    Vec4d a0 = a;
    a = a + 1.0;
    return a0;
}

// prefix operator ++
static inline Vec4d & operator ++ (Vec4d & a) {
    a = a + 1.0;
    return a;
}

// vector operator - : subtract element by element
static inline Vec4d operator - (Vec4d const & a, Vec4d const & b) {
    return _mm256_sub_pd(a, b);
}

// vector operator - : subtract vector and scalar
static inline Vec4d operator - (Vec4d const & a, double b) {
    return a - Vec4d(b);
}
static inline Vec4d operator - (double a, Vec4d const & b) {
    return Vec4d(a) - b;
}

// vector operator - : unary minus
// Change sign bit, even for 0, INF and NAN
static inline Vec4d operator - (Vec4d const & a) {
    return _mm256_xor_pd(a, _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000> ()));
}

// vector operator -= : subtract
static inline Vec4d & operator -= (Vec4d & a, Vec4d const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec4d operator -- (Vec4d & a, int) {
    Vec4d a0 = a;
    a = a - 1.0;
    return a0;
}

// prefix operator --
static inline Vec4d & operator -- (Vec4d & a) {
    a = a - 1.0;
    return a;
}

// vector operator * : multiply element by element
static inline Vec4d operator * (Vec4d const & a, Vec4d const & b) {
    return _mm256_mul_pd(a, b);
}

// vector operator * : multiply vector and scalar
static inline Vec4d operator * (Vec4d const & a, double b) {
    return a * Vec4d(b);
}
static inline Vec4d operator * (double a, Vec4d const & b) {
    return Vec4d(a) * b;
}

// vector operator *= : multiply
static inline Vec4d & operator *= (Vec4d & a, Vec4d const & b) {
    a = a * b;
    return a;
}

// vector operator / : divide element by element
static inline Vec4d operator / (Vec4d const & a, Vec4d const & b) {
    return _mm256_div_pd(a, b);
}

// vector operator / : divide vector and scalar
static inline Vec4d operator / (Vec4d const & a, double b) {
    return a / Vec4d(b);
}
static inline Vec4d operator / (double a, Vec4d const & b) {
    return Vec4d(a) / b;
}

// vector operator /= : divide
static inline Vec4d & operator /= (Vec4d & a, Vec4d const & b) {
    a = a / b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec4db operator == (Vec4d const & a, Vec4d const & b) {
    return _mm256_cmp_pd(a, b, 0);
}

// vector operator != : returns true for elements for which a != b
static inline Vec4db operator != (Vec4d const & a, Vec4d const & b) {
    return _mm256_cmp_pd(a, b, 4);
}

// vector operator < : returns true for elements for which a < b
static inline Vec4db operator < (Vec4d const & a, Vec4d const & b) {
    return _mm256_cmp_pd(a, b, 1);
}

// vector operator <= : returns true for elements for which a <= b
static inline Vec4db operator <= (Vec4d const & a, Vec4d const & b) {
    return _mm256_cmp_pd(a, b, 2);
}

// vector operator > : returns true for elements for which a > b
static inline Vec4db operator > (Vec4d const & a, Vec4d const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b
static inline Vec4db operator >= (Vec4d const & a, Vec4d const & b) {
    return b <= a;
}

// Bitwise logical operators

// vector operator & : bitwise and
static inline Vec4d operator & (Vec4d const & a, Vec4d const & b) {
    return _mm256_and_pd(a, b);
}

// vector operator &= : bitwise and
static inline Vec4d & operator &= (Vec4d & a, Vec4d const & b) {
    a = a & b;
    return a;
}

// vector operator & : bitwise and of Vec4d and Vec4db
static inline Vec4d operator & (Vec4d const & a, Vec4db const & b) {
    return _mm256_and_pd(a, b);
}
static inline Vec4d operator & (Vec4db const & a, Vec4d const & b) {
    return _mm256_and_pd(a, b);
}

// vector operator | : bitwise or
static inline Vec4d operator | (Vec4d const & a, Vec4d const & b) {
    return _mm256_or_pd(a, b);
}

// vector operator |= : bitwise or
static inline Vec4d & operator |= (Vec4d & a, Vec4d const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec4d operator ^ (Vec4d const & a, Vec4d const & b) {
    return _mm256_xor_pd(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec4d & operator ^= (Vec4d & a, Vec4d const & b) {
    a = a ^ b;
    return a;
}

// vector operator ! : logical not. Returns Boolean vector
static inline Vec4db operator ! (Vec4d const & a) {
    return a == Vec4d(0.0);
}


/*****************************************************************************
*
*          Functions for Vec4d
*
*****************************************************************************/

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
// No other values are allowed.
static inline Vec4d select (Vec4db const & s, Vec4d const & a, Vec4d const & b) {
    return _mm256_blendv_pd(b, a, s);
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec4d if_add (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
    return a + (Vec4d(f) & b);
}

// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
static inline Vec4d if_mul (Vec4db const & f, Vec4d const & a, Vec4d const & b) {
    return a * select(f, b, 1.);
}
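
// Example of use (an illustrative sketch):
//   Vec4d  a(1., 2., 3., 4.), b(10., 10., 10., 10.);
//   Vec4db f = a > 2.5;           // f is (false, false, true, true)
//   Vec4d  c = select(f, a, b);   // c is (10, 10, 3, 4)
//   Vec4d  d = if_add(f, a, b);   // d is (1, 2, 13, 14)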


// General arithmetic functions, etc.

// Horizontal add: Calculates the sum of all vector elements.
static inline double horizontal_add (Vec4d const & a) {
    __m256d t1 = _mm256_hadd_pd(a,a);
    __m128d t2 = _mm256_extractf128_pd(t1,1);
    __m128d t3 = _mm_add_sd(_mm256_castpd256_pd128(t1),t2);
    return _mm_cvtsd_f64(t3);
}
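
// Example of use (an illustrative sketch):
//   double s = horizontal_add(Vec4d(1., 2., 3., 4.));   // s is 10.0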

// function max: a > b ? a : b
static inline Vec4d max(Vec4d const & a, Vec4d const & b) {
    return _mm256_max_pd(a,b);
}

// function min: a < b ? a : b
static inline Vec4d min(Vec4d const & a, Vec4d const & b) {
    return _mm256_min_pd(a,b);
}

// function abs: absolute value
// Removes sign bit, even for -0.0, -INF and -NAN
static inline Vec4d abs(Vec4d const & a) {
    __m256d mask = _mm256_castps_pd(constant8f<-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF,-1,0x7FFFFFFF> ());
    return _mm256_and_pd(a,mask);
}
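
// Example of use (an illustrative sketch):
//   Vec4d a(-1., 2., -3., 4.), b(0., 5., -7., 1.);
//   max(a, b)    // (0, 5, -3, 4)
//   min(a, b)    // (-1, 2, -7, 1)
//   abs(a)       // (1, 2, 3, 4)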

// function sqrt: square root
static inline Vec4d sqrt(Vec4d const & a) {
    return _mm256_sqrt_pd(a);
}

// function square: a * a
static inline Vec4d square(Vec4d const & a) {
    return a * a;
}

// pow(Vec4d, int):
template <typename TT> static Vec4d pow(Vec4d const & a, TT const & n);

// Raise floating point numbers to integer power n
template <>
inline Vec4d pow<int>(Vec4d const & x0, int const & n) {
    return pow_template_i<Vec4d>(x0, n);
}

// allow conversion from unsigned int
template <>
inline Vec4d pow<uint32_t>(Vec4d const & x0, uint32_t const & n) {
    return pow_template_i<Vec4d>(x0, (int)n);
}


// Raise floating point numbers to integer power n, where n is a compile-time constant
template <int n>
static inline Vec4d pow_n(Vec4d const & a) {
    if (n < 0)    return Vec4d(1.0) / pow_n<-n>(a);
    if (n == 0)   return Vec4d(1.0);
    if (n >= 256) return pow(a, n);
    Vec4d x = a;                       // a^(2^i)
    Vec4d y;                           // accumulator
    const int lowest = n - (n & (n-1));// lowest set bit in n
    if (n & 1) y = x;
    if (n < 2) return y;
    x = x*x;                           // x^2
    if (n & 2) {
        if (lowest == 2) y = x; else y *= x;
    }
    if (n < 4) return y;
    x = x*x;                           // x^4
    if (n & 4) {
        if (lowest == 4) y = x; else y *= x;
    }
    if (n < 8) return y;
    x = x*x;                           // x^8
    if (n & 8) {
        if (lowest == 8) y = x; else y *= x;
    }
    if (n < 16) return y;
    x = x*x;                           // x^16
    if (n & 16) {
        if (lowest == 16) y = x; else y *= x;
    }
    if (n < 32) return y;
    x = x*x;                           // x^32
    if (n & 32) {
        if (lowest == 32) y = x; else y *= x;
    }
    if (n < 64) return y;
    x = x*x;                           // x^64
    if (n & 64) {
        if (lowest == 64) y = x; else y *= x;
    }
    if (n < 128) return y;
    x = x*x;                           // x^128
    if (n & 128) {
        if (lowest == 128) y = x; else y *= x;
    }
    return y;
}

template <int n>
static inline Vec4d pow(Vec4d const & a, Const_int_t<n>) {
    return pow_n<n>(a);
}
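
// Example of use (an illustrative sketch):
//   Vec4d x(1., 2., 3., 4.);
//   Vec4d y = pow(x, 3);      // y is (1, 8, 27, 64)
//   Vec4d z = pow_n<3>(x);    // same result, with the exponent as a compile-time constant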


// function round: round to nearest integer (even). (result as double vector)
static inline Vec4d round(Vec4d const & a) {
    return _mm256_round_pd(a, 0+8);
}

// function truncate: round towards zero. (result as double vector)
static inline Vec4d truncate(Vec4d const & a) {
    return _mm256_round_pd(a, 3+8);
}

// function floor: round towards minus infinity. (result as double vector)
static inline Vec4d floor(Vec4d const & a) {
    return _mm256_round_pd(a, 1+8);
}

// function ceil: round towards plus infinity. (result as double vector)
static inline Vec4d ceil(Vec4d const & a) {
    return _mm256_round_pd(a, 2+8);
}

// function round_to_int: round to nearest integer (even). (result as integer vector)
static inline Vec4i round_to_int(Vec4d const & a) {
    // Note: assume MXCSR control register is set to rounding
    return _mm256_cvtpd_epi32(a);
}

// function truncate_to_int: round towards zero. (result as integer vector)
static inline Vec4i truncate_to_int(Vec4d const & a) {
    return _mm256_cvttpd_epi32(a);
}
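
// Example of use (an illustrative sketch; round_to_int assumes the default
// round-to-nearest mode in MXCSR):
//   Vec4d a(1.5, 2.5, -1.5, -2.7);
//   round(a)           // (2, 2, -2, -3)   (nearest, ties to even)
//   truncate(a)        // (1, 2, -1, -2)
//   floor(a)           // (1, 2, -2, -3)
//   ceil(a)            // (2, 3, -1, -2)
//   round_to_int(a)    // Vec4i(2, 2, -2, -3)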

#ifdef VECTORI256_H  // 256 bit integer vectors are available

// function truncate_to_int64: round towards zero. (inefficient)
static inline Vec4q truncate_to_int64(Vec4d const & a) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    //return _mm256_maskz_cvttpd_epi64( __mmask8(0xFF), a);
    return _mm256_cvttpd_epi64(a);
#else
    double aa[4];
    a.store(aa);
    return Vec4q(int64_t(aa[0]), int64_t(aa[1]), int64_t(aa[2]), int64_t(aa[3]));
#endif
}

// function truncate_to_int64_limited: round towards zero.
// result as 64-bit integer vector, but with limited range. Deprecated!
static inline Vec4q truncate_to_int64_limited(Vec4d const & a) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    return truncate_to_int64(a);
#elif VECTORI256_H > 1
    // Note: assume MXCSR control register is set to rounding
    Vec2q   b = _mm256_cvttpd_epi32(a);                    // round to 32-bit integers
    __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b));      // get bits 64-127 to position 128-191
    __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
    return      _mm256_unpacklo_epi32(c, s);               // interleave with sign extensions
#else
    return Vec4q(truncate_to_int64_limited(a.get_low()), truncate_to_int64_limited(a.get_high()));
#endif
}

// function round_to_int64: round to nearest integer (even). (inefficient)
static inline Vec4q round_to_int64(Vec4d const & a) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    return _mm256_cvtpd_epi64(a);
#else
    return truncate_to_int64(round(a));
#endif
}

// function round_to_int64_limited: round to nearest integer (even)
// result as 64-bit integer vector, but with limited range. Deprecated!
static inline Vec4q round_to_int64_limited(Vec4d const & a) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    return round_to_int64(a);
#elif VECTORI256_H > 1
    // Note: assume MXCSR control register is set to rounding
    Vec2q   b = _mm256_cvtpd_epi32(a);                     // round to 32-bit integers
    __m256i c = permute4q<0,-256,1,-256>(Vec4q(b,b));      // get bits 64-127 to position 128-191
    __m256i s = _mm256_srai_epi32(c, 31);                  // sign extension bits
    return      _mm256_unpacklo_epi32(c, s);               // interleave with sign extensions
#else
    return Vec4q(round_to_int64_limited(a.get_low()), round_to_int64_limited(a.get_high()));
#endif
}

// function to_double: convert integer vector elements to double vector (inefficient)
static inline Vec4d to_double(Vec4q const & a) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    return _mm256_maskz_cvtepi64_pd( __mmask16(0xFF), a);
#else
    int64_t aa[4];
    a.store(aa);
    return Vec4d(double(aa[0]), double(aa[1]), double(aa[2]), double(aa[3]));
#endif
}

// function to_double_limited: convert integer vector elements to double vector
// limited to abs(x) < 2^31. Deprecated!
static inline Vec4d to_double_limited(Vec4q const & x) {
#if defined (__AVX512DQ__) && defined (__AVX512VL__)
    return to_double(x);
#else
    Vec8i compressed = permute8i<0,2,4,6,-256,-256,-256,-256>(Vec8i(x));
    return _mm256_cvtepi32_pd(compressed.get_low());  // AVX
#endif
}

#endif // VECTORI256_H

// function to_double: convert integer vector to double vector
static inline Vec4d to_double(Vec4i const & a) {
    return _mm256_cvtepi32_pd(a);
}

// function compress: convert two Vec4d to one Vec8f
static inline Vec8f compress (Vec4d const & low, Vec4d const & high) {
    __m128 t1 = _mm256_cvtpd_ps(low);
    __m128 t2 = _mm256_cvtpd_ps(high);
    return Vec8f(t1, t2);
}

// Function extend_low : convert Vec8f vector elements 0 - 3 to Vec4d
static inline Vec4d extend_low(Vec8f const & a) {
    return _mm256_cvtps_pd(_mm256_castps256_ps128(a));
}

// Function extend_high : convert Vec8f vector elements 4 - 7 to Vec4d
static inline Vec4d extend_high (Vec8f const & a) {
    return _mm256_cvtps_pd(_mm256_extractf128_ps(a,1));
}
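
// Example of use (an illustrative sketch; note that compress rounds each
// double to the nearest float):
//   Vec4d lo(1., 2., 3., 4.), hi(5., 6., 7., 8.);
//   Vec8f f = compress(lo, hi);   // f is (1, 2, 3, 4, 5, 6, 7, 8)
//   Vec4d d = extend_low(f);      // d is (1, 2, 3, 4)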

// Fused multiply and add functions

// Multiply and add
static inline Vec4d mul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
#ifdef __FMA__
    return _mm256_fmadd_pd(a, b, c);
#elif defined (__FMA4__)
    return _mm256_macc_pd(a, b, c);
#else
    return a * b + c;
#endif
}

// Multiply and subtract
static inline Vec4d mul_sub(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
#ifdef __FMA__
    return _mm256_fmsub_pd(a, b, c);
#elif defined (__FMA4__)
    return _mm256_msub_pd(a, b, c);
#else
    return a * b - c;
#endif
}

// Multiply and inverse subtract
static inline Vec4d nmul_add(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
#ifdef __FMA__
    return _mm256_fnmadd_pd(a, b, c);
#elif defined (__FMA4__)
    return _mm256_nmacc_pd(a, b, c);
#else
    return c - a * b;
#endif
}
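
// Example of use (an illustrative sketch):
//   Vec4d a(2.), b(3.), c(1.);
//   mul_add(a, b, c)    // (7, 7, 7, 7),     a * b + c
//   mul_sub(a, b, c)    // (5, 5, 5, 5),     a * b - c
//   nmul_add(a, b, c)   // (-5, -5, -5, -5), c - a * b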

// Multiply and subtract with extra precision on the intermediate calculations,
// even if FMA instructions not supported, using Veltkamp-Dekker split
static inline Vec4d mul_sub_x(Vec4d const & a, Vec4d const & b, Vec4d const & c) {
#ifdef __FMA__
    return _mm256_fmsub_pd(a, b, c);
#elif defined (__FMA4__)
    return _mm256_msub_pd(a, b, c);
#else
    // calculate a * b - c with extra precision
    // mask to remove lower 27 bits
    Vec4d upper_mask = _mm256_castps_pd(constant8f<(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1,(int)0xF8000000,-1>());
    Vec4d a_high = a & upper_mask;               // split into high and low parts
    Vec4d b_high = b & upper_mask;
    Vec4d a_low  = a - a_high;
    Vec4d b_low  = b - b_high;
    Vec4d r1 = a_high * b_high;                  // this product is exact
    Vec4d r2 = r1 - c;                           // subtract c from high product
    Vec4d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
    return r3; // + ((r2 - r1) + c);
#endif
}


// Math functions using fast bit manipulation

#ifdef VECTORI256_H  // 256 bit integer vectors are available
// Extract the exponent as an integer
// exponent(a) = floor(log2(abs(a)));
// exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
static inline Vec4q exponent(Vec4d const & a) {
#if VECTORI256_H > 1  // AVX2
    Vec4uq t1 = _mm256_castpd_si256(a);// reinterpret as 64-bit integer
    Vec4uq t2 = t1 << 1;               // shift out sign bit
    Vec4uq t3 = t2 >> 53;              // shift down logical to position 0
    Vec4q  t4 = Vec4q(t3) - 0x3FF;     // subtract bias from exponent
    return t4;
#else
    return Vec4q(exponent(a.get_low()), exponent(a.get_high()));
#endif
}

// Extract the fraction part of a floating point number
// a = 2^exponent(a) * fraction(a), except for a = 0
// fraction(1.0) = 1.0, fraction(5.0) = 1.25
static inline Vec4d fraction(Vec4d const & a) {
#if VECTORI256_H > 1  // AVX2
    Vec4uq t1 = _mm256_castpd_si256(a);   // reinterpret as 64-bit integer
    Vec4uq t2 = Vec4uq((t1 & 0x000FFFFFFFFFFFFF) | 0x3FF0000000000000); // set exponent to 0 + bias
    return _mm256_castsi256_pd(t2);
#else
    return Vec4d(fraction(a.get_low()), fraction(a.get_high()));
#endif
}

// Fast calculation of pow(2,n) with n integer
// n  =     0 gives 1.0
// n >=  1024 gives +INF
// n <= -1023 gives 0.0
// This function will never produce denormals, and never raise exceptions
static inline Vec4d exp2(Vec4q const & n) {
#if VECTORI256_H > 1  // AVX2
    Vec4q t1 = max(n,  -0x3FF);        // limit to allowed range
    Vec4q t2 = min(t1,  0x400);
    Vec4q t3 = t2 + 0x3FF;             // add bias
    Vec4q t4 = t3 << 52;               // put exponent into position 52
    return _mm256_castsi256_pd(t4);    // reinterpret as double
#else
    return Vec4d(exp2(n.get_low()), exp2(n.get_high()));
#endif
}
//static inline Vec4d exp2(Vec4d const & x); // defined in vectormath_exp.h
#endif
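
// Example of use when 256-bit integer vectors are available (an illustrative sketch):
//   exponent(Vec4d(8.0))   // Vec4q(3, 3, 3, 3),  since 8.0 = 2^3 * 1.0
//   fraction(Vec4d(5.0))   // Vec4d(1.25, ...),   since 5.0 = 2^2 * 1.25
//   exp2(Vec4q(-2))        // Vec4d(0.25, 0.25, 0.25, 0.25)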


// Categorization functions

// Function sign_bit: gives true for elements that have the sign bit set
// even for -0.0, -INF and -NAN
// Note that sign_bit(Vec4d(-0.0)) gives true, while Vec4d(-0.0) < Vec4d(0.0) gives false
static inline Vec4db sign_bit(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer
    Vec4q t2 = t1 >> 63;               // extend sign bit
    return _mm256_castsi256_pd(t2);    // reinterpret as 64-bit Boolean
#else
    return Vec4db(sign_bit(a.get_low()),sign_bit(a.get_high()));
#endif
}

// Function sign_combine: changes the sign of a when b has the sign bit set
// same as select(sign_bit(b), -a, a)
static inline Vec4d sign_combine(Vec4d const & a, Vec4d const & b) {
    Vec4d signmask = _mm256_castps_pd(constant8f<0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000,0,(int)0x80000000>());  // -0.0
    return a ^ (b & signmask);
}
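
// Example: sign_combine(Vec4d(2., 2., -2., -2.), Vec4d(1., -1., 1., -1.))
// returns (2, -2, -2, 2) (an illustrative sketch)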

// Function is_finite: gives true for elements that are normal, denormal or zero,
// false for INF and NAN
static inline Vec4db is_finite(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer
    Vec4q t2 = t1 << 1;                // shift out sign bit
    Vec4q t3 = 0xFFE0000000000000;     // exponent mask
    Vec4qb t4 = Vec4q(t2 & t3) != t3;  // exponent field is not all 1s
    return t4;
#else
    return Vec4db(is_finite(a.get_low()),is_finite(a.get_high()));
#endif
}

// Function is_inf: gives true for elements that are +INF or -INF
// false for finite numbers and NAN
static inline Vec4db is_inf(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer
    Vec4q t2 = t1 << 1;                // shift out sign bit
    return t2 == 0xFFE0000000000000;   // exponent is all 1s, fraction is 0
#else
    return Vec4db(is_inf(a.get_low()),is_inf(a.get_high()));
#endif
}

// Function is_nan: gives true for elements that are +NAN or -NAN
// false for finite numbers and +/-INF
static inline Vec4db is_nan(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer
    Vec4q t2 = t1 << 1;                // shift out sign bit
    Vec4q t3 = 0xFFE0000000000000;     // exponent mask
    Vec4q t4 = t2 & t3;                // exponent
    Vec4q t5 = _mm256_andnot_si256(t3,t2);// fraction
    return Vec4qb(t4 == t3 && t5 != 0);// exponent = all 1s and fraction != 0
#else
    return Vec4db(is_nan(a.get_low()),is_nan(a.get_high()));
#endif
}

// Function is_subnormal: gives true for elements that are denormal (subnormal)
// false for normal numbers, zero, NAN and INF
static inline Vec4db is_subnormal(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t1 = _mm256_castpd_si256(a); // reinterpret as 64-bit integer
    Vec4q t2 = t1 << 1;                // shift out sign bit
    Vec4q t3 = 0xFFE0000000000000;     // exponent mask
    Vec4q t4 = t2 & t3;                // exponent
    Vec4q t5 = _mm256_andnot_si256(t3,t2);// fraction
    return Vec4qb(t4 == 0 && t5 != 0); // exponent = 0 and fraction != 0
#else
    return Vec4db(is_subnormal(a.get_low()),is_subnormal(a.get_high()));
#endif
}

// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
// false for normal numbers, NAN and INF
static inline Vec4db is_zero_or_subnormal(Vec4d const & a) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    Vec4q t = _mm256_castpd_si256(a);  // reinterpret as 64-bit integer
          t &= 0x7FF0000000000000ll;   // isolate exponent
    return t == 0;                     // exponent = 0
#else
    return Vec4db(is_zero_or_subnormal(a.get_low()),is_zero_or_subnormal(a.get_high()));
#endif
}

// Function infinite4d: returns a vector where all elements are +INF
static inline Vec4d infinite4d() {
    return _mm256_castps_pd(constant8f<0,0x7FF00000,0,0x7FF00000,0,0x7FF00000,0,0x7FF00000>());
}

// Function nan4d: returns a vector where all elements are +NAN (quiet)
static inline Vec4d nan4d(int n = 0x10) {
#if defined (VECTORI256_H) && VECTORI256_H > 1  // 256 bit integer vectors are available, AVX2
    return _mm256_castsi256_pd(Vec4q(0x7FF8000000000000 + n));
#else
    return Vec4d(nan2d(n),nan2d(n));
#endif
}
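
// Example of use (an illustrative sketch):
//   is_finite(Vec4d(1.0))   // (true, true, true, true)
//   is_inf(infinite4d())    // (true, true, true, true)
//   is_nan(nan4d())         // (true, true, true, true)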

// change signs on vector Vec4d
// Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
template <int i0, int i1, int i2, int i3>
static inline Vec4d change_sign(Vec4d const & a) {
    if ((i0 | i1 | i2 | i3) == 0) return a;
    __m256 mask = constant8f<0, i0 ? (int)0x80000000 : 0, 0, i1 ? (int)0x80000000 : 0, 0, i2 ? (int)0x80000000 : 0, 0, i3 ? (int)0x80000000 : 0> ();
    return _mm256_xor_pd(a, _mm256_castps_pd(mask));
}


/*****************************************************************************
*
*          Functions for reinterpretation between vector types
*
*****************************************************************************/
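
// These functions pass the bit pattern through unchanged. Example
// (an illustrative sketch):
//   Vec4q q(0x3FF0000000000000);   // bit pattern of double 1.0 in each element
//   Vec4d d = reinterpret_d(q);    // d is (1.0, 1.0, 1.0, 1.0)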

#if defined (VECTORI256_H) && VECTORI256_H >= 2
// AVX2 vectors defined


// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
// It is recommended to compile with -fabi-version=0 to get the latest abi version
#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)
static inline __m256i reinterpret_i (__m256i const & x) {
    return x;
}

static inline __m256i reinterpret_i (__m256  const & x) {
    return _mm256_castps_si256(x);
}

static inline __m256i reinterpret_i (__m256d const & x) {
    return _mm256_castpd_si256(x);
}

static inline __m256  reinterpret_f (__m256i const & x) {
    return _mm256_castsi256_ps(x);
}

static inline __m256  reinterpret_f (__m256  const & x) {
    return x;
}

static inline __m256  reinterpret_f (__m256d const & x) {
    return _mm256_castpd_ps(x);
}

static inline __m256d reinterpret_d (__m256i const & x) {
    return _mm256_castsi256_pd(x);
}

static inline __m256d reinterpret_d (__m256  const & x) {
    return _mm256_castps_pd(x);
}

static inline __m256d reinterpret_d (__m256d const & x) {
    return x;
}

#else  // __GXX_ABI_VERSION < 1004

static inline __m256i reinterpret_i (Vec32c const & x) {
    return x;
}

static inline __m256i reinterpret_i (Vec16s const & x) {
    return x;
}

static inline __m256i reinterpret_i (Vec8i const & x) {
    return x;
}

static inline __m256i reinterpret_i (Vec4q const & x) {
    return x;
}

static inline __m256i reinterpret_i (Vec8f  const & x) {
    return _mm256_castps_si256(x);
}

static inline __m256i reinterpret_i (Vec4d const & x) {
    return _mm256_castpd_si256(x);
}

static inline __m256  reinterpret_f (Vec32c const & x) {
    return _mm256_castsi256_ps(x);
}

static inline __m256  reinterpret_f (Vec16s const & x) {
    return _mm256_castsi256_ps(x);
}

static inline __m256  reinterpret_f (Vec8i const & x) {
    return _mm256_castsi256_ps(x);
}

static inline __m256  reinterpret_f (Vec4q const & x) {
    return _mm256_castsi256_ps(x);
}

static inline __m256  reinterpret_f (Vec8f  const & x) {
    return x;
}

static inline __m256  reinterpret_f (Vec4d const & x) {
    return _mm256_castpd_ps(x);
}

static inline __m256d reinterpret_d (Vec32c const & x) {
    return _mm256_castsi256_pd(x);
}

static inline __m256d reinterpret_d (Vec16s const & x) {
    return _mm256_castsi256_pd(x);
}

static inline __m256d reinterpret_d (Vec8i const & x) {
    return _mm256_castsi256_pd(x);
}

static inline __m256d reinterpret_d (Vec4q const & x) {
    return _mm256_castsi256_pd(x);
}

static inline __m256d reinterpret_d (Vec8f  const & x) {
    return _mm256_castps_pd(x);
}

static inline __m256d reinterpret_d (Vec4d const & x) {
    return x;
}

#endif  // __GXX_ABI_VERSION

#else
// AVX2 emulated in vectori256e.h, AVX supported

// ABI version 4 or later needed on Gcc for correct mangling of 256-bit intrinsic vectors.
// It is recommended to compile with -fabi-version=0 to get the latest abi version
#if !defined (GCC_VERSION) || (defined (__GXX_ABI_VERSION) && __GXX_ABI_VERSION >= 1004)

static inline Vec256ie reinterpret_i (__m256  const & x) {
    Vec8f xx(x);
    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
}

static inline Vec256ie reinterpret_i (__m256d const & x) {
    Vec4d xx(x);
    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
}

static inline __m256  reinterpret_f (__m256  const & x) {
    return x;
}

static inline __m256  reinterpret_f (__m256d const & x) {
    return _mm256_castpd_ps(x);
}

static inline __m256d reinterpret_d (__m256  const & x) {
    return _mm256_castps_pd(x);
}

static inline __m256d reinterpret_d (__m256d const & x) {
    return x;
}

#else  // __GXX_ABI_VERSION < 1004

static inline Vec256ie reinterpret_i (Vec8f const & x) {
    Vec8f xx(x);
    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
}

static inline Vec256ie reinterpret_i (Vec4d const & x) {
    Vec4d xx(x);
    return Vec256ie(reinterpret_i(xx.get_low()), reinterpret_i(xx.get_high()));
}

static inline __m256  reinterpret_f (Vec8f const & x) {
    return x;
}

static inline __m256  reinterpret_f (Vec4d const & x) {
    return _mm256_castpd_ps(x);
}

static inline __m256d reinterpret_d (Vec8f const & x) {
    return _mm256_castps_pd(x);
}

static inline __m256d reinterpret_d (Vec4d const & x) {
    return x;
}

#endif  // __GXX_ABI_VERSION

static inline Vec256ie reinterpret_i (Vec256ie const & x) {
    return x;
}

static inline __m256  reinterpret_f (Vec256ie const & x) {
    return Vec8f(Vec4f(reinterpret_f(x.get_low())), Vec4f(reinterpret_f(x.get_high())));
}

static inline __m256d reinterpret_d (Vec256ie const & x) {
    return Vec4d(Vec2d(reinterpret_d(x.get_low())), Vec2d(reinterpret_d(x.get_high())));
}

#endif  // VECTORI256_H


/*****************************************************************************
*
*          Vector permute and blend functions
*
******************************************************************************
*
* The permute function can reorder the elements of a vector and optionally
* set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to
* select. An index of -1 will generate zero. An index of -256 means don't care.
*
* Example:
* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
* Vec4d b;
* b = permute4d<1,0,-1,3>(a);     // b is (11, 10,  0, 13)
*
*
* The blend function can mix elements from two different vectors and
* optionally set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to
* select, where indexes 0 - 3 indicate an element from the first source
* vector and indexes 4 - 7 indicate an element from the second source vector.
* A negative index will generate zero.
*
*
* Example:
* Vec4d a(10., 11., 12., 13.);    // a is (10, 11, 12, 13)
* Vec4d b(20., 21., 22., 23.);    // b is (20, 21, 22, 23)
* Vec4d c;
* c = blend4d<4,3,7,-1> (a,b);    // c is (20, 13, 23,  0)
*
* A lot of the code here is metaprogramming aiming to find the instructions
* that best fit the template parameters and instruction set. The metacode
* will be reduced out to leave only a few vector instructions in release
* mode with optimization on.
*****************************************************************************/

// permute vector Vec4d
template <int i0, int i1, int i2, int i3>
static inline Vec4d permute4d(Vec4d const & a) {

    const int ior = i0 | i1 | i2 | i3;  // OR indexes

    // is zeroing needed
    const bool do_zero    = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100

    // is shuffling needed
    const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0);

    if (!do_shuffle) {       // no shuffling needed
        if (do_zero) {       // zeroing
            if ((i0 & i1 & i2 & i3) < 0) {
                return _mm256_setzero_pd(); // zero everything
            }
            // zero some elements
            __m256d const mask = _mm256_castps_pd (
                constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ());
            return _mm256_and_pd(a, mask);     // zero with AND mask
        }
        else {
            return a;  // do nothing
        }
    }
#if INSTRSET >= 8  // AVX2: use VPERMPD
    __m256d x = _mm256_permute4x64_pd(a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6);
    if (do_zero) {       // zeroing
        // zero some elements
        __m256d const mask2 = _mm256_castps_pd (
            constant8f< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0), -int(i2>=0), -int(i2>=0), -int(i3>=0), -int(i3>=0) > ());
        x = _mm256_and_pd(x, mask2);     // zero with AND mask
    }
    return x;
#else   // AVX

    // Needed contents of low/high part of each source register in VSHUFPD
    // 0: a.low, 1: a.high, 3: zero
    const int s1 = (i0 < 0 ? 3 : (i0 & 2) >> 1) | (i2 < 0 ? 0x30 : (i2 & 2) << 3);
    const int s2 = (i1 < 0 ? 3 : (i1 & 2) >> 1) | (i3 < 0 ? 0x30 : (i3 & 2) << 3);
    // permute mask
    const int sm = (i0 < 0 ? 0 : (i0 & 1)) | (i1 < 0 ? 1 : (i1 & 1)) << 1 | (i2 < 0 ? 0 : (i2 & 1)) << 2 | (i3 < 0 ? 1 : (i3 & 1)) << 3;

    if (s1 == 0x01 || s1 == 0x11 || s2 == 0x01 || s2 == 0x11) {
        // too expensive to use 256 bit permute, split into two 128 bit permutes
        Vec2d alo = a.get_low();
        Vec2d ahi = a.get_high();
        Vec2d rlo = blend2d<i0, i1> (alo, ahi);
        Vec2d rhi = blend2d<i2, i3> (alo, ahi);
        return Vec4d(rlo, rhi);
    }

    // make operands for VSHUFPD
    __m256d r1, r2;

    switch (s1) {
    case 0x00:  // LL
        r1 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1);  break;
    case 0x03:  // ZL
        r1 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
        break;
    case 0x10:  // LH
        r1 = a;  break;
    case 0x13:  // ZH
        r1 = do_zero ? _mm256_and_pd(a, _mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a);  break;
    case 0x30:  // LZ
        if (do_zero) {
            __m128d t  = _mm256_castpd256_pd128(a);
            t  = _mm_and_pd(t,t);
            r1 = _mm256_castpd128_pd256(t);
        }
        else r1 = a;
        break;
    case 0x31:  // HZ
        r1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a,1));  break;
    case 0x33:  // ZZ
        r1 = do_zero ? _mm256_setzero_pd() : __m256d(a);  break;
    default:;   // Not needed. Avoid warning in Clang
    }

    if (s2 == s1) {
        if (sm == 0x0A) return r1;
        r2 = r1;
    }
    else {
        switch (s2) {
        case 0x00:  // LL
            r2 = _mm256_insertf128_pd(a,_mm256_castpd256_pd128(a),1);  break;
        case 0x03:  // ZL
            r2 = _mm256_insertf128_pd(do_zero ? _mm256_setzero_pd() : __m256d(a), _mm256_castpd256_pd128(a), 1);
            break;
        case 0x10:  // LH
            r2 = a;  break;
        case 0x13:  // ZH
            r2 = do_zero ? _mm256_and_pd(a,_mm256_castps_pd(constant8f<0,0,0,0,-1,-1,-1,-1>())) : __m256d(a);  break;
        case 0x30:  // LZ
            if (do_zero) {
                __m128d t  = _mm256_castpd256_pd128(a);
                t  = _mm_and_pd(t,t);
                r2 = _mm256_castpd128_pd256(t);
            }
            else r2 = a;
            break;
        case 0x31:  // HZ
            r2 = _mm256_castpd128_pd256(_mm256_extractf128_pd(a,1));  break;
        case 0x33:  // ZZ
            r2 = do_zero ? _mm256_setzero_pd() : __m256d(a);  break;
        default:;   // Not needed. Avoid warning in Clang
        }
    }
    return  _mm256_shuffle_pd(r1, r2, sm);

#endif  // INSTRSET >= 8
}


// blend vectors Vec4d
template <int i0, int i1, int i2, int i3>
static inline Vec4d blend4d(Vec4d const & a, Vec4d const & b) {

    // Combine all the indexes into a single bitfield, with 8 bits for each
    const int m1 = (i0 & 7) | (i1 & 7) << 8 | (i2 & 7) << 16 | (i3 & 7) << 24;

    // Mask to zero out negative indexes
    const uint32_t mz = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8 | (i2 < 0 ? 0 : 0xFF) << 16 | (i3 < 0 ? 0 : 0xFF) << 24;

    if (mz == 0) return _mm256_setzero_pd();  // all zero

    __m256d t1;
    if ((((m1 & 0xFEFEFEFE) ^ 0x06020400) & mz) == 0) {
        // fits VSHUFPD(a,b)
        t1 = _mm256_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
        if (mz == 0xFFFFFFFF) return t1;
        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
    }
    if ((((m1 & 0xFEFEFEFE) ^ 0x02060004) & mz) == 0) {
        // fits VSHUFPD(b,a)
        t1 = _mm256_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1 | (i2 & 1) << 2 | (i3 & 1) << 3);
        if (mz == 0xFFFFFFFF) return t1;
        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
    }
    if ((((m1 & 0x03030303) ^ 0x03020100) & mz) == 0) {
        // blend and zero, no permute
        if ((m1 & 0x04040404 & mz) == 0) {
            t1 = a;
        }
        else if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) {
            t1 = b;
        }
        else {
            t1 = _mm256_blend_pd(a, b, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1);
        }
        if (mz == 0xFFFFFFFF) return t1;
        return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
    }
    if ((m1 & 0x04040404 & mz) == 0) {
        // all from a
        return permute4d<i0, i1, i2, i3> (a);
    }
    if (((m1 ^ 0x04040404) & 0x04040404 & mz) == 0) {
        // all from b
        return permute4d<i0 ^ 4, i1 ^ 4, i2 ^ 4, i3 ^ 4> (b);
    }
    // check if we can do 128-bit blend/permute
    if (((m1 ^ 0x01000100) & 0x01010101 & mz) == 0) {
        const uint32_t j0 = uint32_t((i0 >= 0 ? i0 : i1 >= 0 ? i1 : -1) >> 1);
        const uint32_t j1 = uint32_t((i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 1);
        if (((m1 ^ ((j0 & 3) * 0x00000202 | (j1 & 3) * 0x02020000)) & 0x06060606 & mz) == 0) {
            t1 = _mm256_permute2f128_pd(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
            const bool partialzero = (((i0 | i1) ^ j0) & 0x80) != 0 || (((i2 | i3) ^ j1) & 0x80) != 0;
            if (partialzero) {
                // zero some elements
                __m256d mask = _mm256_castps_pd (constant8f <
                    i0 < 0 ? 0 : -1, i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i1 < 0 ? 0 : -1,
                    i2 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1, i3 < 0 ? 0 : -1 > ());
                return _mm256_and_pd(t1, mask);
            }
            else return t1;
        }
    }
    // general case. combine two permutes
    Vec4d a1 = permute4d <
        (uint32_t)i0 < 4 ? i0 : -0x100,
        (uint32_t)i1 < 4 ? i1 : -0x100,
        (uint32_t)i2 < 4 ? i2 : -0x100,
        (uint32_t)i3 < 4 ? i3 : -0x100 > (a);
    Vec4d b1 = permute4d <
        (uint32_t)(i0^4) < 4 ? (i0^4) : -0x100,
        (uint32_t)(i1^4) < 4 ? (i1^4) : -0x100,
        (uint32_t)(i2^4) < 4 ? (i2^4) : -0x100,
        (uint32_t)(i3^4) < 4 ? (i3^4) : -0x100 > (b);
    t1 = _mm256_blend_pd(a1, b1, (i0&4)>>2 | (i1&4)>>1 | (i2&4) | (i3&4) << 1);
    if (mz == 0xFFFFFFFF) return t1;
    return permute4d<i0 < 0 ? -1 : 0, i1 < 0 ? -1 : 1, i2 < 0 ? -1 : 2, i3 < 0 ? -1 : 3> (t1);
}
2596 
2597 /*****************************************************************************
2598 *
2599 *          Vector Vec8f permute and blend functions
2600 *
2601 *****************************************************************************/
2602 
2603 // permute vector Vec8f
2604 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8f(Vec8f const & a)2605 static inline Vec8f permute8f(Vec8f const & a) {
2606 
2607     __m256 t1, mask;
2608 
2609     const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes
2610 
2611     // is zeroing needed
2612     const bool do_zero    = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100
2613 
2614     // is shuffling needed
2615     const bool do_shuffle = (i0>0) || (i1!=1 && i1>=0) || (i2!=2 && i2>=0) || (i3!=3 && i3>=0) ||
2616         (i4!=4 && i4>=0) || (i5!=5 && i5>=0) || (i6!=6 && i6>=0) || (i7!=7 && i7>=0);
2617 
2618     if (!do_shuffle) {       // no shuffling needed
2619         if (do_zero) {       // zeroing
2620             if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) {
2621                 return _mm256_setzero_ps(); // zero everything
2622             }
2623             // zero some elements
2624             mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
2625             return _mm256_and_ps(a, mask);     // zero with AND mask
2626         }
2627         else {
2628             return a;  // do nothing
2629         }
2630     }
2631 
2632 #if INSTRSET >= 8  // AVX2: use VPERMPS
2633     if (do_shuffle) {    // shuffling
2634         mask = constant8f< i0 & 7, i1 & 7, i2 & 7, i3 & 7, i4 & 7, i5 & 7, i6 & 7, i7 & 7 > ();
2635 #if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
2636         // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
2637         t1 = _mm256_permutevar8x32_ps(mask, _mm256_castps_si256(a));      //  problem in immintrin.h
2638 #elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
2639         // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1
2640         t1 = _mm256_permutevar8x32_ps(mask, a);
2641 #else   // no bug version
2642         t1 = _mm256_permutevar8x32_ps(a, _mm256_castps_si256(mask));
2643 #endif
2644     }
2645     else {
2646         t1 = a;          // no shuffling
2647     }
2648     if (do_zero) {       // zeroing
2649         if ((i0 & i1 & i2 & i3 & i4 & i5 & i6 & i7) < 0) {
2650             return _mm256_setzero_ps(); // zero everything
2651         }
2652         // zero some elements
2653         mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
2654         t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
2655     }
2656     return t1;
2657 #else   // AVX
2658 
2659     // Combine all the indexes into a single bitfield, with 4 bits for each
2660     const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<<8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;
2661 
2662     // Mask to zero out negative indexes
2663     const int m2 = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
2664 
2665     // Check if it is possible to use VSHUFPS. Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2
2666     const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & m2 & (m2 >> 16)) == 0  &&  ((m1 ^ (m1 >> 4)) & 0x04040404 & m2 & m2 >> 4) == 0;
2667 
2668     if (sps) {   // can use VSHUFPS
2669 
2670         // Index of each pair (i[n],i[n+1])
2671         const int j0 = i0 >= 0 ? i0 : i1;
2672         const int j1 = i2 >= 0 ? i2 : i3;
2673         const int j2 = i4 >= 0 ? i4 : i5;
2674         const int j3 = i6 >= 0 ? i6 : i7;
2675 
2676         // Index of each pair (i[n],i[n+4])
2677         const int k0 = i0 >= 0 ? i0 : i4;
2678         const int k1 = i1 >= 0 ? i1 : i5;
2679         const int k2 = i2 >= 0 ? i2 : i6;
2680         const int k3 = i3 >= 0 ? i3 : i7;
2681 
2682         // Needed contents of low/high part of each source register in VSHUFPS
2683         // 0: a.low, 1: a.high, 3: zero or don't care
        const int s1 = (j0 < 0 ? 3 : (j0 & 4) >> 2) | (j2 < 0 ? 0x30 : (j2 & 4) << 2);
        const int s2 = (j1 < 0 ? 3 : (j1 & 4) >> 2) | (j3 < 0 ? 0x30 : (j3 & 4) << 2);

        // calculate cost of using VSHUFPS
        const int cost1 = (s1 == 0x01 || s1 == 0x11) ? 2 : (s1 == 0x00 || s1 == 0x03 || s1 == 0x31) ? 1 : 0;
        const int cost2 = (s2 == s1) ? 0 : (s2 == 0x01 || s2 == 0x11) ? 2 : (s2 == 0x00 || (s2 == 0x03 && (s1 & 0xF0) != 0x00) || (s2 == 0x31 && (s1 & 0x0F) != 0x01)) ? 1 : 0;

        if (cost1 + cost2 <= 3) {

            // permute mask
            const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;

            // make operands for VSHUFPS
            __m256 r1, r2;

            switch (s1) {
            case 0x00:  // LL
            case 0x03:  // ZL
                r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
            case 0x01:  // HL
                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
                r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(a),1);  break;
            case 0x10:  // LH
            case 0x13:  // ZH
            case 0x30:  // LZ
            case 0x33:  // ZZ
                r1 = a;  break;
            case 0x11:  // HH
                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
                r1 = _mm256_insertf128_ps(r1,_mm256_castps256_ps128(r1),1);  break;
            case 0x31:  // HZ
                r1 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));  break;
            }

            if (s2 == s1) {
                if (sm == 0xE4) return r1;
                r2 = r1;
            }
            else {
                switch (s2) {
                case 0x00:  // LL
                    r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
                case 0x03:  // ZL
                    if ((s1 & 0xF0) == 0x00) r2 = r1;
                    else {
                        r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);
                    }
                    break;
                case 0x01:  // HL
                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
                    r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(a),1);  break;
                case 0x10:  // LH
                case 0x13:  // ZH
                case 0x30:  // LZ
                case 0x33:  // ZZ
                    r2 = a;  break;
                case 0x11:  // HH
                    r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
                    r2 = _mm256_insertf128_ps(r2,_mm256_castps256_ps128(r2),1);  break;
                case 0x31:  // HZ
                    if ((s1 & 0x0F) == 0x01) r2 = r1;
                    else {
                        r2 = _mm256_castps128_ps256(_mm256_extractf128_ps(a,1));
                    }
                    break;
                }
            }

            // now the permute instruction
            t1 = _mm256_shuffle_ps(r1, r2, sm);

            if (do_zero) {
                // zero some elements
                mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
                t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
            }
            return t1;
        }
    }
    // not using VSHUFPS. Split into low and high part
    Vec4f alo = a.get_low();
    Vec4f ahi = a.get_high();
    Vec4f rlo = blend4f<i0, i1, i2, i3> (alo, ahi);
    Vec4f rhi = blend4f<i4, i5, i6, i7> (alo, ahi);
    return Vec8f(rlo, rhi);
#endif
}
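// Usage sketch for permute8f (illustrative only, not part of the library
// interface): each template argument selects a source element of a;
// index -1 gives zero, -256 means don't care.
// Vec8f a(0, 1, 2, 3, 4, 5, 6, 7);
// Vec8f r1 = permute8f<7,6,5,4,3,2,1,0>(a);   // r1 = (7, 6, 5, 4, 3, 2, 1, 0)
// Vec8f r2 = permute8f<-1,0,1,2,-1,4,5,6>(a); // r2 = (0, 0, 1, 2, 0, 4, 5, 6)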


// blend vectors Vec8f
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8f blend8f(Vec8f const & a, Vec8f const & b) {

    const int ior = i0 | i1 | i2 | i3 | i4 | i5 | i6 | i7;  // OR indexes

    // is zeroing needed
    const bool do_zero  = ior < 0 && (ior & 0x80); // at least one index is negative, and not -0x100

    // Combine all the indexes into a single bitfield, with 4 bits for each
    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;

    // Mask to zero out negative indexes
    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;

    __m256 t1, mask;

    if (mz == 0) return _mm256_setzero_ps();  // all zero

    if ((m1 & 0x88888888 & mz) == 0) {
        // all from a
        return permute8f<i0, i1, i2, i3, i4, i5, i6, i7> (a);
    }

    if (((m1 ^ 0x88888888) & 0x88888888 & mz) == 0) {
        // all from b
        return permute8f<i0&~8, i1&~8, i2&~8, i3&~8, i4&~8, i5&~8, i6&~8, i7&~8> (b);
    }

    if ((((m1 & 0x77777777) ^ 0x76543210) & mz) == 0) {
        // blend and zero, no permute
        mask = constant8f<(i0&8)?0:-1, (i1&8)?0:-1, (i2&8)?0:-1, (i3&8)?0:-1, (i4&8)?0:-1, (i5&8)?0:-1, (i6&8)?0:-1, (i7&8)?0:-1> ();
        t1   = select(mask, a, b);
        if (!do_zero) return t1;
        // zero some elements
        mask = constant8f< (i0<0&&(i0&8)) ? 0 : -1, (i1<0&&(i1&8)) ? 0 : -1, (i2<0&&(i2&8)) ? 0 : -1, (i3<0&&(i3&8)) ? 0 : -1,
            (i4<0&&(i4&8)) ? 0 : -1, (i5<0&&(i5&8)) ? 0 : -1, (i6<0&&(i6&8)) ? 0 : -1, (i7<0&&(i7&8)) ? 0 : -1 > ();
        return _mm256_and_ps(t1, mask);
    }

    // check if we can do 128-bit blend/permute
    if (((m1 ^ 0x32103210) & 0x33333333 & mz) == 0) {
        const uint32_t j0 = (i0 >= 0 ? i0 : i1 >= 0 ? i1 : i2 >= 0 ? i2 : i3 >= 0 ? i3 : -1) >> 2;
        const uint32_t j1 = (i4 >= 0 ? i4 : i5 >= 0 ? i5 : i6 >= 0 ? i6 : i7 >= 0 ? i7 : -1) >> 2;
        if (((m1 ^ ((j0 & 3) * 0x00004444 | (j1 & 3) * 0x44440000)) & 0xCCCCCCCC & mz) == 0) {
            t1 = _mm256_permute2f128_ps(a, b, (j0 & 0x0F) | (j1 & 0x0F) << 4);
            const bool partialzero = (((i0 | i1 | i2 | i3) ^ j0) & 0x80) != 0 || (((i4 | i5 | i6 | i7) ^ j1) & 0x80) != 0;
            if (partialzero) {
                // zero some elements
                mask = constant8f< i0 < 0 ? 0 : -1, i1 < 0 ? 0 : -1, i2 < 0 ? 0 : -1, i3 < 0 ? 0 : -1,
                    i4 < 0 ? 0 : -1, i5 < 0 ? 0 : -1, i6 < 0 ? 0 : -1, i7 < 0 ? 0 : -1 > ();
                return _mm256_and_ps(t1, mask);
            }
            else return t1;
        }
    }
    // Not checking special cases for vunpckhps, vunpcklps: they are too rare

    // Check if it is possible to use VSHUFPS.
    // Index n must match index n+4 on bit 0-1, and even index n must match odd index n+1 on bit 2-3
    const bool sps = ((m1 ^ (m1 >> 16)) & 0x3333 & mz & (mz >> 16)) == 0  &&  ((m1 ^ (m1 >> 4)) & 0x0C0C0C0C & mz & mz >> 4) == 0;

    if (sps) {   // can use VSHUFPS

        // Index of each pair (i[n],i[n+1])
        const int j0 = i0 >= 0 ? i0 : i1;
        const int j1 = i2 >= 0 ? i2 : i3;
        const int j2 = i4 >= 0 ? i4 : i5;
        const int j3 = i6 >= 0 ? i6 : i7;

        // Index of each pair (i[n],i[n+4])
        const int k0 = i0 >= 0 ? i0 : i4;
        const int k1 = i1 >= 0 ? i1 : i5;
        const int k2 = i2 >= 0 ? i2 : i6;
        const int k3 = i3 >= 0 ? i3 : i7;

        // Needed contents of low/high part of each source register in VSHUFPS
        // 0: a.low, 1: a.high, 2: b.low, 3: b.high, 4: zero or don't care
        const int s1 = (j0 < 0 ? 4 : (j0 & 0xC) >> 2) | (j2 < 0 ? 0x30 : (j2 & 0xC) << 2);
        const int s2 = (j1 < 0 ? 3 : (j1 & 0xC) >> 2) | (j3 < 0 ? 0x30 : (j3 & 0xC) << 2);

        // permute mask
        const int sm = (k0 < 0 ? 0 : (k0 & 3)) | (k1 < 0 ? 1 : (k1 & 3)) << 2 | (k2 < 0 ? 2 : (k2 & 3)) << 4 | (k3 < 0 ? 3 : (k3 & 3)) << 6;

        __m256 r1, r2;
        __m128 ahi = _mm256_extractf128_ps(a,1);    // 1
        __m128 bhi = _mm256_extractf128_ps(b,1);    // 3

        switch (s1) {
        case 0x00:  case 0x04:
            r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
        case 0x01:  case 0x41:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1);  break;
        case 0x02:
            r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1);  break;
        case 0x03:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1);  break;
        case 0x10:  case 0x14:  case 0x40:  case 0x44:
            r1 = a;  break;
        case 0x11:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1);  break;
        case 0x12:
            r1 = _mm256_insertf128_ps(b,ahi,1);  break;
        case 0x13:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1);  break;
        case 0x20:
            r1 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1);  break;
        case 0x21:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1);  break;
        case 0x22:  case 0x24:  case 0x42:
            r1 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1);  break;
        case 0x23:  case 0x43:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1);  break;
        case 0x30:
            r1 = _mm256_insertf128_ps(a,bhi,1);  break;
        case 0x31:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1);  break;
        case 0x32:  case 0x34:
            r1 = b;  break;
        case 0x33:
            r1 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1);  break;
        }
        if (s2 == s1 || ((s2 & 0x04) && ((s1 ^ s2) & 0xF0) == 0) || ((s2 & 0x40) && ((s1 ^ s2) & 0x0F) == 0)) {
            // can use r2 = r1
            if (sm == 0xE4) return r1;  // no shuffling needed
            r2 = r1;
        }
        else {
            switch (s2) {
            case 0x00:  case 0x04:
                r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(a),1);  break;
            case 0x01:  case 0x41:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(a),1);  break;
            case 0x02:
                r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(a),1);  break;
            case 0x03:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(a),1);  break;
            case 0x10:  case 0x14:  case 0x40:  case 0x44:
                r2 = a;  break;
            case 0x11:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),ahi,1);  break;
            case 0x12:
                r2 = _mm256_insertf128_ps(b,ahi,1);  break;
            case 0x13:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),ahi,1);  break;
            case 0x20:
                r2 = _mm256_insertf128_ps(a,_mm256_castps256_ps128(b),1);  break;
            case 0x21:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),_mm256_castps256_ps128(b),1);  break;
            case 0x22:  case 0x24:  case 0x42:
                r2 = _mm256_insertf128_ps(b,_mm256_castps256_ps128(b),1);  break;
            case 0x23:  case 0x43:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),_mm256_castps256_ps128(b),1);  break;
            case 0x30:
                r2 = _mm256_insertf128_ps(a,bhi,1);  break;
            case 0x31:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(ahi),bhi,1);  break;
            case 0x32:  case 0x34:
                r2 = b;  break;
            case 0x33:
                r2 = _mm256_insertf128_ps(_mm256_castps128_ps256(bhi),bhi,1);  break;
            }
        }

        // now the shuffle instruction
        t1 = _mm256_shuffle_ps(r1, r2, sm);

        if (do_zero) {
            // zero some elements
            mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
            t1 = _mm256_and_ps(t1, mask);     // zero with AND mask
        }
        return t1;
    }

    // Check if we can use 64-bit blend. Even numbered indexes must be even and odd numbered
    // indexes must be equal to the preceding index + 1, except for negative indexes.
    if (((m1 ^ 0x10101010) & 0x11111111 & mz) == 0 && ((m1 ^ m1 >> 4) & 0x0E0E0E0E & mz & mz >> 4) == 0) {

        const bool partialzero = int((i0 ^ i1) | (i2 ^ i3) | (i4 ^ i5) | (i6 ^ i7)) < 0; // part of a 64-bit block is zeroed
        const int blank1 = partialzero ? -0x100 : -1;  // ignore or zero
        const int n0 = i0 >= 0 ? i0/2 : i1 >= 0 ? i1/2 : blank1;  // indexes for 64-bit blend (>= 0 so that index 0 paired with a negative index is still used)
        const int n1 = i2 >= 0 ? i2/2 : i3 >= 0 ? i3/2 : blank1;
        const int n2 = i4 >= 0 ? i4/2 : i5 >= 0 ? i5/2 : blank1;
        const int n3 = i6 >= 0 ? i6/2 : i7 >= 0 ? i7/2 : blank1;
        t1 = _mm256_castpd_ps (blend4d<n0,n1,n2,n3> (_mm256_castps_pd(a), _mm256_castps_pd(b)));
        if (blank1 == -1 || !do_zero) {
            return  t1;
        }
        // need more zeroing
        mask = constant8f< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0), -int(i4>=0), -int(i5>=0), -int(i6>=0), -int(i7>=0) > ();
        return _mm256_and_ps(t1, mask);     // zero with AND mask
    }

    // general case: permute and blend and possible zero
    const int blank2 = do_zero ? -1 : -0x100;  // ignore or zero

    Vec8f ta = permute8f <
        (uint32_t)i0 < 8 ? i0 : blank2,
        (uint32_t)i1 < 8 ? i1 : blank2,
        (uint32_t)i2 < 8 ? i2 : blank2,
        (uint32_t)i3 < 8 ? i3 : blank2,
        (uint32_t)i4 < 8 ? i4 : blank2,
        (uint32_t)i5 < 8 ? i5 : blank2,
        (uint32_t)i6 < 8 ? i6 : blank2,
        (uint32_t)i7 < 8 ? i7 : blank2 > (a);
    Vec8f tb = permute8f <
        (uint32_t)(i0^8) < 8 ? (i0^8) : blank2,
        (uint32_t)(i1^8) < 8 ? (i1^8) : blank2,
        (uint32_t)(i2^8) < 8 ? (i2^8) : blank2,
        (uint32_t)(i3^8) < 8 ? (i3^8) : blank2,
        (uint32_t)(i4^8) < 8 ? (i4^8) : blank2,
        (uint32_t)(i5^8) < 8 ? (i5^8) : blank2,
        (uint32_t)(i6^8) < 8 ? (i6^8) : blank2,
        (uint32_t)(i7^8) < 8 ? (i7^8) : blank2 > (b);

    if (blank2 == -1) {
        return  _mm256_or_ps(ta, tb);
    }
    // no zeroing, need to blend
    const int maskb = ((i0 >> 3) & 1) | ((i1 >> 2) & 2) | ((i2 >> 1) & 4) | (i3 & 8) |
        ((i4 << 1) & 0x10) | ((i5 << 2) & 0x20) | ((i6 << 3) & 0x40) | ((i7 << 4) & 0x80);
    return _mm256_blend_ps(ta, tb, maskb);  // blend
}
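// Usage sketch for blend8f (illustrative only): indexes 0-7 pick from a,
// 8-15 pick from b, -1 gives zero.
// Vec8f a(0, 1, 2, 3, 4, 5, 6, 7), b(10, 11, 12, 13, 14, 15, 16, 17);
// Vec8f r = blend8f<0,8,1,9,2,10,-1,11>(a, b);  // r = (0, 10, 1, 11, 2, 12, 0, 13)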


/*****************************************************************************
*
*          Vector lookup functions
*
******************************************************************************
*
* These functions use vector elements as indexes into a table.
* The table is given as one or more vectors or as an array.
*
* This can be used for several purposes:
*  - table lookup
*  - permute or blend with variable indexes
*  - blend from more than two sources
*  - gather non-contiguous data
*
* An index out of range may produce any value - the actual value produced is
* implementation dependent and may be different for different instruction
* sets. An index out of range does not produce an error message or exception.
*
* Example:
* Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
* Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
* Vec4f c;
* c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
*
*****************************************************************************/

#ifdef VECTORI256_H  // Vec8i and Vec4q must be defined

static inline Vec8f lookup8(Vec8i const & index, Vec8f const & table) {
#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
    // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
    return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), _mm256_castps_si256(table));
#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
    // Gcc 4.7.0 has wrong parameter type and operands in wrong order. fixed in version 4.7.1
    return _mm256_permutevar8x32_ps(_mm256_castsi256_ps(index), table);
#else
    // no bug version
    return _mm256_permutevar8x32_ps(table, index);
#endif

#else // AVX
    // swap low and high part of table
    __m256  t1 = _mm256_castps128_ps256(_mm256_extractf128_ps(table, 1));
    __m256  t2 = _mm256_insertf128_ps(t1, _mm256_castps256_ps128(table), 1);
    // join index parts
    __m256i index2 = _mm256_insertf128_si256(_mm256_castsi128_si256(index.get_low()), index.get_high(), 1);
    // permute within each 128-bit part
    __m256  r0 = _mm256_permutevar_ps(table, index2);
    __m256  r1 = _mm256_permutevar_ps(t2,    index2);
    // high index bit for blend
    __m128i k1 = _mm_slli_epi32(index.get_high() ^ 4, 29);
    __m128i k0 = _mm_slli_epi32(index.get_low(),      29);
    __m256  kk = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_castsi128_ps(k0)), _mm_castsi128_ps(k1), 1);
    // blend the two permutes
    return _mm256_blendv_ps(r0, r1, kk);
#endif
}
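// Usage sketch for lookup8 (illustrative only): each index element selects
// one of the 8 table elements.
// Vec8i ix(7, 6, 5, 4, 3, 2, 1, 0);
// Vec8f tbl(0.f, 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f);
// Vec8f r = lookup8(ix, tbl);  // r = (7, 6, 5, 4, 3, 2, 1, 0)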

template <int n>
static inline Vec8f lookup(Vec8i const & index, float const * table) {
    if (n <= 0) return 0;
    if (n <= 4) {
        Vec4f table1 = Vec4f().load(table);
        return Vec8f(
            lookup4 (index.get_low(),  table1),
            lookup4 (index.get_high(), table1));
    }
#if INSTRSET < 8  // not AVX2
    if (n <= 8) {
        return lookup8(index, Vec8f().load(table));
    }
#endif
    // Limit index
    Vec8ui index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec8ui(index) & (n-1);
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec8ui(index), n-1);
    }
#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
    return _mm256_i32gather_ps(table, index1, 4);
#else // AVX
    return Vec8f(table[index1[0]],table[index1[1]],table[index1[2]],table[index1[3]],
    table[index1[4]],table[index1[5]],table[index1[6]],table[index1[7]]);
#endif
}
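// Usage sketch (illustrative only): n is the table length; indexes are
// wrapped (power of 2) or clamped to stay inside the table, as coded above.
// float tbl[8] = {10, 11, 12, 13, 14, 15, 16, 17};
// Vec8i ix(3, 3, 0, 7, 1, 1, 2, 6);
// Vec8f r = lookup<8>(ix, tbl);  // r = (13, 13, 10, 17, 11, 11, 12, 16)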

static inline Vec4d lookup4(Vec4q const & index, Vec4d const & table) {
#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
    // We can't use VPERMPD here because it takes only constant (immediate) indexes.
    // Convert the index to fit VPERMPS instead
    Vec8i index1 = permute8i<0,0,2,2,4,4,6,6> (Vec8i(index+index));
    Vec8i index2 = index1 + Vec8i(constant8i<0,1,0,1,0,1,0,1>());
#if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
    // bug in MS VS 11 beta: operands in wrong order. fixed in 11.0
    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_si256(table)));
#elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
    // Gcc 4.7.0 has wrong parameter type and operands in wrong order
    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(index2), _mm256_castpd_ps(table)));
#else
    // no bug version
    return _mm256_castps_pd(_mm256_permutevar8x32_ps(_mm256_castpd_ps(table), index2));
#endif

#else // AVX
    // swap low and high part of table
    __m256d t1 = _mm256_castpd128_pd256(_mm256_extractf128_pd(table, 1));
    __m256d t2 = _mm256_insertf128_pd(t1, _mm256_castpd256_pd128(table), 1);
    // index << 1
    __m128i index2lo = index.get_low()  + index.get_low();
    __m128i index2hi = index.get_high() + index.get_high();
    // join index parts
    __m256i index3 = _mm256_insertf128_si256(_mm256_castsi128_si256(index2lo), index2hi, 1);
    // permute within each 128-bit part
    __m256d r0 = _mm256_permutevar_pd(table, index3);
    __m256d r1 = _mm256_permutevar_pd(t2,    index3);
    // high index bit for blend
    __m128i k1 = _mm_slli_epi64(index.get_high() ^ 2, 62);
    __m128i k0 = _mm_slli_epi64(index.get_low(),      62);
    __m256d kk = _mm256_insertf128_pd(_mm256_castpd128_pd256(_mm_castsi128_pd(k0)), _mm_castsi128_pd(k1), 1);
    // blend the two permutes
    return _mm256_blendv_pd(r0, r1, kk);
#endif
}
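// Usage sketch for lookup4 (illustrative only), analogous to the Vec4f
// example in the comment block above:
// Vec4q ix(2, 0, 0, 3);
// Vec4d tbl(1.0, 1.1, 1.2, 1.3);
// Vec4d r = lookup4(ix, tbl);  // r = (1.2, 1.0, 1.0, 1.3)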

template <int n>
static inline Vec4d lookup(Vec4q const & index, double const * table) {
    if (n <= 0) return 0;
    if (n <= 2) {
        Vec2d table1 = Vec2d().load(table);
        return Vec4d(
            lookup2 (index.get_low(),  table1),
            lookup2 (index.get_high(), table1));
    }
#if INSTRSET < 8  // not AVX2
    if (n <= 4) {
        return lookup4(index, Vec4d().load(table));
    }
#endif
    // Limit index
    Vec8ui index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec8ui(index) & constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>();
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec8ui(index), constant8i<n-1, 0, n-1, 0, n-1, 0, n-1, 0>() );
    }
#if INSTRSET >= 8 && VECTORI256_H > 1 // AVX2
    return _mm256_i64gather_pd(table, index1, 8);
#else // AVX
    Vec4q index2 = Vec4q(index1);
    return Vec4d(table[index2[0]],table[index2[1]],table[index2[2]],table[index2[3]]);
#endif
}
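// Usage sketch (illustrative only):
// double tbl[4] = {1.0, 1.1, 1.2, 1.3};
// Vec4q ix(3, 2, 1, 0);
// Vec4d r = lookup<4>(ix, tbl);  // r = (1.3, 1.2, 1.1, 1.0)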
#endif  // VECTORI256_H

/*****************************************************************************
*
*          Gather functions with fixed indexes
*
*****************************************************************************/
// Load elements from array a with indices i0, i1, i2, i3, ..
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8f gather8f(void const * a) {
    return reinterpret_f(gather8i<i0, i1, i2, i3, i4, i5, i6, i7>(a));
}

// Load elements from array a with indices i0, i1, i2, i3
template <int i0, int i1, int i2, int i3>
static inline Vec4d gather4d(void const * a) {
    return reinterpret_d(gather4q<i0, i1, i2, i3>(a));
}
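// Usage sketch (illustrative only): the indexes are compile-time constants,
// which can allow the gather to be turned into full-vector loads and permutes.
// float arr[8] = {0, 10, 20, 30, 40, 50, 60, 70};
// Vec8f g = gather8f<7,0,7,0,1,1,2,2>(arr);  // g = (70, 0, 70, 0, 10, 10, 20, 20)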

/*****************************************************************************
*
*          Vector scatter functions
*
******************************************************************************
*
* These functions write the elements of a vector to arbitrary positions in an
* array in memory. Each vector element is written to an array position
* determined by an index. An element is not written if the corresponding
* index is out of range.
* The indexes can be specified as constant template parameters or as an
* integer vector.
*
* The scatter functions are useful if the data are distributed in a sparse
* manner into the array. If the array is dense then it is more efficient
* to permute the data into the right positions and then write the whole
* permuted vector into the array.
*
* Example:
* Vec8f a(10,11,12,13,14,15,16,17);
* float b[16] = {0};
* scatter<0,2,14,10,1,-1,5,9>(a,b);
* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
*
*****************************************************************************/

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline void scatter(Vec8f const & data, float * array) {
#if defined (__AVX512VL__)
    __m256i indx = constant8i<i0,i1,i2,i3,i4,i5,i6,i7>();
    __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7);
    _mm256_mask_i32scatter_ps(array, mask, indx, data, 4);
#elif defined (__AVX512F__)
    __m512i indx = _mm512_castsi256_si512(constant8i<i0,i1,i2,i3,i4,i5,i6,i7>());
    __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3| (i4>=0)<<4| (i5>=0)<<5| (i6>=0)<<6| (i7>=0)<<7);
    _mm512_mask_i32scatter_ps(array, mask, indx, _mm512_castps256_ps512(data), 4);
#else
    const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
    for (int i = 0; i < 8; i++) {
        if (index[i] >= 0) array[index[i]] = data[i];
    }
#endif
}

template <int i0, int i1, int i2, int i3>
static inline void scatter(Vec4d const & data, double * array) {
#if defined (__AVX512VL__)
    __m128i indx = constant4i<i0,i1,i2,i3>();
    __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3);
    _mm256_mask_i32scatter_pd(array, mask, indx, data, 8);
#elif defined (__AVX512F__)
    __m256i indx = _mm256_castsi128_si256(constant4i<i0,i1,i2,i3>());
    __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3);
    _mm512_mask_i32scatter_pd(array, mask, indx, _mm512_castpd256_pd512(data), 8);
#else
    const int index[4] = {i0,i1,i2,i3};
    for (int i = 0; i < 4; i++) {
        if (index[i] >= 0) array[index[i]] = data[i];
    }
#endif
}

static inline void scatter(Vec8i const & index, uint32_t limit, Vec8f const & data, float * array) {
#if defined (__AVX512VL__)
    __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
    _mm256_mask_i32scatter_ps(array, mask, index, data, 4);
#elif defined (__AVX512F__)
    // 16 bit mask. upper 8 bits are (0<0) = false
    __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
    _mm512_mask_i32scatter_ps(array, mask, _mm512_castsi256_si512(index), _mm512_castps256_ps512(data), 4);
#else
    for (int i = 0; i < 8; i++) {
        if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
    }
#endif
}

static inline void scatter(Vec4q const & index, uint32_t limit, Vec4d const & data, double * array) {
#if defined (__AVX512VL__)
    __mmask16 mask = _mm256_cmplt_epu64_mask(index, Vec4uq(uint64_t(limit)));
    _mm256_mask_i64scatter_pd(array, mask, index, data, 8);
#elif defined (__AVX512F__)
    // 16 bit mask. upper 12 bits are (0<0) = false
    __mmask16 mask = _mm512_cmplt_epu64_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec4uq(uint64_t(limit))));
    _mm512_mask_i64scatter_pd(array, mask, _mm512_castsi256_si512(index), _mm512_castpd256_pd512(data), 8);
#else
    for (int i = 0; i < 4; i++) {
        if (uint64_t(index[i]) < uint64_t(limit)) array[index[i]] = data[i];
    }
#endif
}

static inline void scatter(Vec4i const & index, uint32_t limit, Vec4d const & data, double * array) {
#if defined (__AVX512VL__)
    __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
    _mm256_mask_i32scatter_pd(array, mask, index, data, 8);
#elif defined (__AVX512F__)
    // 16 bit mask. upper 12 bits are (0<0) = false
    __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi128_si512(index), _mm512_castsi128_si512(Vec4ui(limit)));
    _mm512_mask_i32scatter_pd(array, mask, _mm256_castsi128_si256(index), _mm512_castpd256_pd512(data), 8);
#else
    for (int i = 0; i < 4; i++) {
        if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
    }
#endif
}
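// Usage sketch for the vector-index scatter variants (illustrative only):
// elements whose index is outside [0, limit) are not stored.
// double arr[4] = {0};
// Vec4d d(10, 11, 12, 13);
// Vec4q ix(3, 9, 0, 2);
// scatter(ix, 4, d, arr);  // arr = {12, 0, 13, 10}; index 9 is skipped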


/*****************************************************************************
*
*          Horizontal scan functions
*
*****************************************************************************/

// Get index to the first element that is true. Return -1 if all are false
static inline int horizontal_find_first(Vec8fb const & x) {
    return horizontal_find_first(Vec8ib(x));
}

static inline int horizontal_find_first(Vec4db const & x) {
    return horizontal_find_first(Vec4qb(x));
}

// Count the number of elements that are true
static inline uint32_t horizontal_count(Vec8fb const & x) {
    return horizontal_count(Vec8ib(x));
}

static inline uint32_t horizontal_count(Vec4db const & x) {
    return horizontal_count(Vec4qb(x));
}
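// Usage sketch (illustrative only):
// Vec8f a(1, 2, 3, 4, 5, 6, 7, 8);
// Vec8fb m = a > 5.5f;                    // true for elements 5, 6, 7
// int first  = horizontal_find_first(m);  // first = 5
// uint32_t n = horizontal_count(m);       // n = 3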

/*****************************************************************************
*
*          Boolean <-> bitfield conversion functions
*
*****************************************************************************/

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec8fb const & x) {
    return to_bits(Vec8ib(x));
}

// to_Vec8fb: convert integer bitfield to boolean vector
static inline Vec8fb to_Vec8fb(uint8_t x) {
    return Vec8fb(to_Vec8ib(x));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4db const & x) {
    return to_bits(Vec4qb(x));
}

// to_Vec4db: convert integer bitfield to boolean vector
static inline Vec4db to_Vec4db(uint8_t x) {
    return Vec4db(to_Vec4qb(x));
}
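// Usage sketch (illustrative only): bit n of the bitfield corresponds to
// element n of the boolean vector.
// Vec8fb m = to_Vec8fb(0xA5);  // elements 0, 2, 5, 7 are true
// uint8_t b = to_bits(m);      // b = 0xA5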

#ifdef VCL_NAMESPACE
}
#endif

#endif // VECTORF256_H