/****************************  vectorf128.h   *******************************
* Author:        Agner Fog
* Date created:  2012-05-30
* Last modified: 2017-05-10
* Version:       1.29
* Project:       vector classes
* Description:
* Header file defining floating point vector classes as interface to
* intrinsic functions in x86 microprocessors with SSE2 and later instruction
* sets up to AVX.
*
* Instructions:
* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
* instruction set, which must be at least SSE2. Specify the supported
* instruction set by a command line define, e.g. __SSE4_1__ if the
* compiler does not automatically do so.
*
* The following vector classes are defined here:
* Vec4f     Vector of 4 single precision floating point numbers
* Vec4fb    Vector of 4 Booleans for use with Vec4f
* Vec2d     Vector of 2 double precision floating point numbers
* Vec2db    Vector of 2 Booleans for use with Vec2d
*
* Each vector object is represented internally in the CPU as a 128-bit register.
* This header file defines operators and functions for these vectors.
*
* For example:
* Vec2d a(1.0, 2.0), b(3.0, 4.0), c;
* c = a + b;     // now c contains (4.0, 6.0)
*
* For detailed instructions, see VectorClass.pdf
*
* (c) Copyright 2012-2017 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/
#ifndef VECTORF128_H
#define VECTORF128_H

#if defined _MSC_VER && _MSC_VER >= 1800
// solve problem with ambiguous overloading of pow function in Microsoft math.h:
// make sure math.h is included first rather than last
#include <math.h>
#endif

#include "vectori128.h"  // Define integer vectors

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

/*****************************************************************************
*
*          select functions
*
*****************************************************************************/
// Select between two __m128 sources, element by element. Used in various functions
// and operators. Corresponds to this pseudocode:
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are
// allowed. The implementation depends on the instruction set:
// If SSE4.1 is supported then only bit 31 in each dword of s is checked,
// otherwise all bits in s are used.
static inline __m128 selectf (__m128 const & s, __m128 const & a, __m128 const & b) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_blendv_ps (b, a, s);
#else
    return _mm_or_ps(
        _mm_and_ps(s,a),
        _mm_andnot_ps(s,b));
#endif
}
// Same, with two __m128d sources. Corresponds to this pseudocode:
// for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true). No other
// values are allowed. The implementation depends on the instruction set:
// If SSE4.1 is supported then only bit 63 in each qword of s is checked,
// otherwise all bits in s are used.
static inline __m128d selectd (__m128d const & s, __m128d const & a, __m128d const & b) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_blendv_pd (b, a, s);
#else
    return _mm_or_pd(
        _mm_and_pd(s,a),
        _mm_andnot_pd(s,b));
#endif
}
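
// Example (illustrative): with a mask s = (0xFFFFFFFF, 0, 0xFFFFFFFF, 0),
// a = (1, 2, 3, 4) and b = (5, 6, 7, 8),
// selectf(s, a, b) returns (1, 6, 3, 8).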


/*****************************************************************************
*
*          Vec4fb: Vector of 4 Booleans for use with Vec4f
*
*****************************************************************************/

class Vec4fb {
protected:
    __m128 xmm; // Float vector
public:
    // Default constructor:
    Vec4fb() {
    }
    // Constructor to build from all elements:
    Vec4fb(bool b0, bool b1, bool b2, bool b3) {
        xmm = _mm_castsi128_ps(_mm_setr_epi32(-(int)b0, -(int)b1, -(int)b2, -(int)b3));
    }
    // Constructor to convert from type __m128 used in intrinsics:
    Vec4fb(__m128 const & x) {
        xmm = x;
    }
    // Assignment operator to convert from type __m128 used in intrinsics:
    Vec4fb & operator = (__m128 const & x) {
        xmm = x;
        return *this;
    }
    // Constructor to broadcast scalar value:
    Vec4fb(bool b) {
        xmm = _mm_castsi128_ps(_mm_set1_epi32(-int32_t(b)));
    }
    // Assignment operator to broadcast scalar value:
    Vec4fb & operator = (bool b) {
        *this = Vec4fb(b);
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec4fb(int b);
    Vec4fb & operator = (int x);
public:
    // Constructor to convert from type Vec4ib used as Boolean for integer vectors
    Vec4fb(Vec4ib const & x) {
        xmm = _mm_castsi128_ps(x);
    }
    // Assignment operator to convert from type Vec4ib used as Boolean for integer vectors
    Vec4fb & operator = (Vec4ib const & x) {
        xmm = _mm_castsi128_ps(x);
        return *this;
    }
    // Type cast operator to convert to __m128 used in intrinsics
    operator __m128() const {
        return xmm;
    }
    /* Clang problem:
    The Clang compiler treats the intrinsic vector types __m128, __m128i, and __m128d as identical.
    I have reported this problem in 2013 but it is still not fixed in 2017!
    See the bug report at http://llvm.org/bugs/show_bug.cgi?id=17164
    Additional problem: The version number is not consistent across platforms. The Apple build has
    different version numbers. We have to rely on __apple_build_version__ on the Mac platform:
    http://llvm.org/bugs/show_bug.cgi?id=12643
    I have received reports that there was no aliasing of vector types on __apple_build_version__ = 6020053
    but apparently the problem has come back. The aliasing of vector types has been reported on
    __apple_build_version__ = 8000042.
    We will have to add switches here when - hopefully - the error is fixed some day.
    We need different version checks with and without __apple_build_version__.
    */

//#if (defined (__clang__) && !defined(__apple_build_version__)) || (defined(__apple_build_version__) && __apple_build_version__ < 6020000)
#if defined (__clang__) /* && CLANG_VERSION < xxxxx */ || defined(__apple_build_version__)
#define FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
#else
    // Type cast operator to convert to type Vec4ib used as Boolean for integer vectors
    operator Vec4ib() const {
        return _mm_castps_si128(xmm);
    }
#endif
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec4fb const & insert(uint32_t index, bool value) {
        static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
        __m128 mask  = _mm_loadu_ps((float const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position
        if (value) {
            xmm = _mm_or_ps(xmm,mask);
        }
        else {
            xmm = _mm_andnot_ps(mask,xmm);
        }
        return *this;
    }
    // Member function extract a single element from vector
    bool extract(uint32_t index) const {
        //return Vec4ib(*this).extract(index);
        return Vec4ib(_mm_castps_si128(xmm)).extract(index);
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    static int size() {
        return 4;
    }
};
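
// Example (illustrative): Vec4fb m(true, false, true, false) holds the mask
// (0xFFFFFFFF, 0, 0xFFFFFFFF, 0); m.extract(2) and m[2] both return true.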


/*****************************************************************************
*
*          Operators for Vec4fb
*
*****************************************************************************/

// vector operator & : bitwise and
static inline Vec4fb operator & (Vec4fb const & a, Vec4fb const & b) {
    return _mm_and_ps(a, b);
}
static inline Vec4fb operator && (Vec4fb const & a, Vec4fb const & b) {
    return a & b;
}

// vector operator &= : bitwise and
static inline Vec4fb & operator &= (Vec4fb & a, Vec4fb const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec4fb operator | (Vec4fb const & a, Vec4fb const & b) {
    return _mm_or_ps(a, b);
}
static inline Vec4fb operator || (Vec4fb const & a, Vec4fb const & b) {
    return a | b;
}

// vector operator |= : bitwise or
static inline Vec4fb & operator |= (Vec4fb & a, Vec4fb const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec4fb operator ^ (Vec4fb const & a, Vec4fb const & b) {
    return _mm_xor_ps(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec4fb & operator ^= (Vec4fb & a, Vec4fb const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec4fb operator ~ (Vec4fb const & a) {
    return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(-1)));
}

// vector operator ! : logical not
// (operator ! is less efficient than operator ~. Use only where not
// all bits in an element are the same)
static inline Vec4fb operator ! (Vec4fb const & a) {
    return Vec4fb( ! Vec4ib(a));
}

// Functions for Vec4fb

// andnot: a & ~ b
static inline Vec4fb andnot(Vec4fb const & a, Vec4fb const & b) {
    return _mm_andnot_ps(b, a);
}


/*****************************************************************************
*
*          Horizontal Boolean functions
*
*****************************************************************************/

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec4fb const & a) {
    return _mm_movemask_ps(a) == 0x0F;
    //return horizontal_and(Vec128b(_mm_castps_si128(a)));
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec4fb const & a) {
    return _mm_movemask_ps(a) != 0;
    //return horizontal_or(Vec128b(_mm_castps_si128(a)));
}
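
// Example (illustrative): for Vec4f a(1.f, 2.f, 3.f, 4.f),
// horizontal_and(a > 0.0f) returns true (every element is positive) and
// horizontal_or(a > 3.5f) returns true (the last element qualifies).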


/*****************************************************************************
*
*          Vec2db: Vector of 2 Booleans for use with Vec2d
*
*****************************************************************************/

class Vec2db {
protected:
    __m128d xmm; // Double vector
public:
    // Default constructor:
    Vec2db() {
    }
    // Constructor to build from all elements:
    Vec2db(bool b0, bool b1) {
        xmm = _mm_castsi128_pd(_mm_setr_epi32(-(int)b0, -(int)b0, -(int)b1, -(int)b1));
    }
    // Constructor to convert from type __m128d used in intrinsics:
    Vec2db(__m128d const & x) {
        xmm = x;
    }
    // Assignment operator to convert from type __m128d used in intrinsics:
    Vec2db & operator = (__m128d const & x) {
        xmm = x;
        return *this;
    }
    // Constructor to broadcast scalar value:
    Vec2db(bool b) {
        xmm = _mm_castsi128_pd(_mm_set1_epi32(-int32_t(b)));
    }
    // Assignment operator to broadcast scalar value:
    Vec2db & operator = (bool b) {
        *this = Vec2db(b);
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec2db(int b);
    Vec2db & operator = (int x);
public:
    // Constructor to convert from type Vec2qb used as Boolean for integer vectors
    Vec2db(Vec2qb const & x) {
        xmm = _mm_castsi128_pd(x);
    }
    // Assignment operator to convert from type Vec2qb used as Boolean for integer vectors
    Vec2db & operator = (Vec2qb const & x) {
        xmm = _mm_castsi128_pd(x);
        return *this;
    }
    // Type cast operator to convert to __m128d used in intrinsics
    operator __m128d() const {
        return xmm;
    }
#ifndef FIX_CLANG_VECTOR_ALIAS_AMBIGUITY
    // Type cast operator to convert to type Vec2qb used as Boolean for integer vectors
    operator Vec2qb() const {
        return _mm_castpd_si128(xmm);
    }
#endif
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec2db const & insert(uint32_t index, bool value) {
        static const int32_t maskl[8] = {0,0,0,0,-1,-1,0,0};
        __m128 mask  = _mm_loadu_ps((float const*)(maskl+4-(index&1)*2)); // mask with FFFFFFFFFFFFFFFF at index position
        if (value) {
            xmm = _mm_or_pd(xmm,_mm_castps_pd(mask));
        }
        else {
            xmm = _mm_andnot_pd(_mm_castps_pd(mask),xmm);
        }
        return *this;
    }
    // Member function extract a single element from vector
    bool extract(uint32_t index) const {
        return Vec2qb(*this).extract(index);
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    static int size() {
        return 2;
    }
};


/*****************************************************************************
*
*          Operators for Vec2db
*
*****************************************************************************/

// vector operator & : bitwise and
static inline Vec2db operator & (Vec2db const & a, Vec2db const & b) {
    return _mm_and_pd(a, b);
}
static inline Vec2db operator && (Vec2db const & a, Vec2db const & b) {
    return a & b;
}

// vector operator &= : bitwise and
static inline Vec2db & operator &= (Vec2db & a, Vec2db const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec2db operator | (Vec2db const & a, Vec2db const & b) {
    return _mm_or_pd(a, b);
}
static inline Vec2db operator || (Vec2db const & a, Vec2db const & b) {
    return a | b;
}

// vector operator |= : bitwise or
static inline Vec2db & operator |= (Vec2db & a, Vec2db const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec2db operator ^ (Vec2db const & a, Vec2db const & b) {
    return _mm_xor_pd(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec2db & operator ^= (Vec2db & a, Vec2db const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec2db operator ~ (Vec2db const & a) {
    return _mm_xor_pd(a, _mm_castsi128_pd(_mm_set1_epi32(-1)));
}

// vector operator ! : logical not
// (operator ! is less efficient than operator ~. Use only where not
// all bits in an element are the same)
static inline Vec2db operator ! (Vec2db const & a) {
    return Vec2db (! Vec2qb(a));
}

// Functions for Vec2db

// andnot: a & ~ b
static inline Vec2db andnot(Vec2db const & a, Vec2db const & b) {
    return _mm_andnot_pd(b, a);
}


/*****************************************************************************
*
*          Horizontal Boolean functions
*
*****************************************************************************/

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec2db const & a) {
    return _mm_movemask_pd(a) == 3;
    //return horizontal_and(Vec128b(_mm_castpd_si128(a)));
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec2db const & a) {
    return _mm_movemask_pd(a) != 0;
    //return horizontal_or(Vec128b(_mm_castpd_si128(a)));
}



/*****************************************************************************
*
*          Vec4f: Vector of 4 single precision floating point values
*
*****************************************************************************/

class Vec4f {
protected:
    __m128 xmm; // Float vector
public:
    // Default constructor:
    Vec4f() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec4f(float f) {
        xmm = _mm_set1_ps(f);
    }
    // Constructor to build from all elements:
    Vec4f(float f0, float f1, float f2, float f3) {
        xmm = _mm_setr_ps(f0, f1, f2, f3);
    }
    // Constructor to convert from type __m128 used in intrinsics:
    Vec4f(__m128 const & x) {
        xmm = x;
    }
    // Assignment operator to convert from type __m128 used in intrinsics:
    Vec4f & operator = (__m128 const & x) {
        xmm = x;
        return *this;
    }
    // Type cast operator to convert to __m128 used in intrinsics
    operator __m128() const {
        return xmm;
    }
    // Member function to load from array (unaligned)
    Vec4f & load(float const * p) {
        xmm = _mm_loadu_ps(p);
        return *this;
    }
    // Member function to load from array, aligned by 16
    // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
    // You may use load_a instead of load if you are certain that p points to an address
    // divisible by 16.
    Vec4f & load_a(float const * p) {
        xmm = _mm_load_ps(p);
        return *this;
    }
    // Member function to store into array (unaligned)
    void store(float * p) const {
        _mm_storeu_ps(p, xmm);
    }
    // Member function to store into array, aligned by 16
    // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
    // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
    // You may use store_a instead of store if you are certain that p points to an address
    // divisible by 16.
    void store_a(float * p) const {
        _mm_store_ps(p, xmm);
    }
    // Partial load. Load n elements and set the rest to 0
    Vec4f & load_partial(int n, float const * p) {
        __m128 t1, t2;
        switch (n) {
        case 1:
            xmm = _mm_load_ss(p); break;
        case 2:
            xmm = _mm_castpd_ps(_mm_load_sd((double const*)p)); break;
        case 3:
            t1 = _mm_castpd_ps(_mm_load_sd((double const*)p));
            t2 = _mm_load_ss(p + 2);
            xmm = _mm_movelh_ps(t1, t2); break;
        case 4:
            load(p); break;
        default:
            xmm = _mm_setzero_ps();
        }
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, float * p) const {
        __m128 t1;
        switch (n) {
        case 1:
            _mm_store_ss(p, xmm); break;
        case 2:
            _mm_store_sd((double*)p, _mm_castps_pd(xmm)); break;
        case 3:
            _mm_store_sd((double*)p, _mm_castps_pd(xmm));
            t1 = _mm_movehl_ps(xmm,xmm);
            _mm_store_ss(p + 2, t1); break;
        case 4:
            store(p); break;
        default:;
        }
    }
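    // Example (illustrative): for float buf[3] = {1.f, 2.f, 3.f},
    // Vec4f v; v.load_partial(3, buf);  gives v = (1, 2, 3, 0), and
    // v.store_partial(2, buf);  writes only buf[0] and buf[1].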
    // cut off vector to n elements. The last 4-n elements are set to zero
    Vec4f & cutoff(int n) {
        if (uint32_t(n) >= 4) return *this;
        static const union {
            int32_t i[8];
            float   f[8];
        } mask = {{1,-1,-1,-1,0,0,0,0}};
        xmm = _mm_and_ps(xmm, Vec4f().load(mask.f + 4 - n));
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec4f const & insert(uint32_t index, float value) {
#if INSTRSET >= 5   // SSE4.1 supported
        switch (index & 3) {
        case 0:
            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 0 << 4);  break;
        case 1:
            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 1 << 4);  break;
        case 2:
            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 2 << 4);  break;
        default:
            xmm = _mm_insert_ps(xmm, _mm_set_ss(value), 3 << 4);  break;
        }
#else
        static const int32_t maskl[8] = {0,0,0,0,-1,0,0,0};
        __m128 broad = _mm_set1_ps(value);  // broadcast value into all elements
        __m128 mask  = _mm_loadu_ps((float const*)(maskl+4-(index & 3))); // mask with FFFFFFFF at index position
        xmm = selectf(mask,broad,xmm);
#endif
        return *this;
    };
    // Member function extract a single element from vector
    float extract(uint32_t index) const {
        float x[4];
        store(x);
        return x[index & 3];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    float operator [] (uint32_t index) const {
        return extract(index);
    }
    static int size() {
        return 4;
    }
};
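
// Example (illustrative):
// Vec4f v(10.f, 11.f, 12.f, 13.f);
// v.insert(2, 99.f);          // v is now (10, 11, 99, 13)
// float x = v[3];             // x = 13.0f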


/*****************************************************************************
*
*          Operators for Vec4f
*
*****************************************************************************/

// vector operator + : add element by element
static inline Vec4f operator + (Vec4f const & a, Vec4f const & b) {
    return _mm_add_ps(a, b);
}

// vector operator + : add vector and scalar
static inline Vec4f operator + (Vec4f const & a, float b) {
    return a + Vec4f(b);
}
static inline Vec4f operator + (float a, Vec4f const & b) {
    return Vec4f(a) + b;
}

// vector operator += : add
static inline Vec4f & operator += (Vec4f & a, Vec4f const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec4f operator ++ (Vec4f & a, int) {
    Vec4f a0 = a;
    a = a + 1.0f;
    return a0;
}

// prefix operator ++
static inline Vec4f & operator ++ (Vec4f & a) {
    a = a + 1.0f;
    return a;
}

// vector operator - : subtract element by element
static inline Vec4f operator - (Vec4f const & a, Vec4f const & b) {
    return _mm_sub_ps(a, b);
}

// vector operator - : subtract vector and scalar
static inline Vec4f operator - (Vec4f const & a, float b) {
    return a - Vec4f(b);
}
static inline Vec4f operator - (float a, Vec4f const & b) {
    return Vec4f(a) - b;
}

// vector operator - : unary minus
// Change sign bit, even for 0, INF and NAN
static inline Vec4f operator - (Vec4f const & a) {
    return _mm_xor_ps(a, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
}

// vector operator -= : subtract
static inline Vec4f & operator -= (Vec4f & a, Vec4f const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec4f operator -- (Vec4f & a, int) {
    Vec4f a0 = a;
    a = a - 1.0f;
    return a0;
}

// prefix operator --
static inline Vec4f & operator -- (Vec4f & a) {
    a = a - 1.0f;
    return a;
}

// vector operator * : multiply element by element
static inline Vec4f operator * (Vec4f const & a, Vec4f const & b) {
    return _mm_mul_ps(a, b);
}

// vector operator * : multiply vector and scalar
static inline Vec4f operator * (Vec4f const & a, float b) {
    return a * Vec4f(b);
}
static inline Vec4f operator * (float a, Vec4f const & b) {
    return Vec4f(a) * b;
}

// vector operator *= : multiply
static inline Vec4f & operator *= (Vec4f & a, Vec4f const & b) {
    a = a * b;
    return a;
}

// vector operator / : divide element by element
static inline Vec4f operator / (Vec4f const & a, Vec4f const & b) {
    return _mm_div_ps(a, b);
}

// vector operator / : divide vector and scalar
static inline Vec4f operator / (Vec4f const & a, float b) {
    return a / Vec4f(b);
}
static inline Vec4f operator / (float a, Vec4f const & b) {
    return Vec4f(a) / b;
}

// vector operator /= : divide
static inline Vec4f & operator /= (Vec4f & a, Vec4f const & b) {
    a = a / b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec4fb operator == (Vec4f const & a, Vec4f const & b) {
    return _mm_cmpeq_ps(a, b);
}

// vector operator != : returns true for elements for which a != b
static inline Vec4fb operator != (Vec4f const & a, Vec4f const & b) {
    return _mm_cmpneq_ps(a, b);
}

// vector operator < : returns true for elements for which a < b
static inline Vec4fb operator < (Vec4f const & a, Vec4f const & b) {
    return _mm_cmplt_ps(a, b);
}

// vector operator <= : returns true for elements for which a <= b
static inline Vec4fb operator <= (Vec4f const & a, Vec4f const & b) {
    return _mm_cmple_ps(a, b);
}

// vector operator > : returns true for elements for which a > b
static inline Vec4fb operator > (Vec4f const & a, Vec4f const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b
static inline Vec4fb operator >= (Vec4f const & a, Vec4f const & b) {
    return b <= a;
}

// Bitwise logical operators

// vector operator & : bitwise and
static inline Vec4f operator & (Vec4f const & a, Vec4f const & b) {
    return _mm_and_ps(a, b);
}

// vector operator &= : bitwise and
static inline Vec4f & operator &= (Vec4f & a, Vec4f const & b) {
    a = a & b;
    return a;
}

// vector operator & : bitwise and of Vec4f and Vec4fb
static inline Vec4f operator & (Vec4f const & a, Vec4fb const & b) {
    return _mm_and_ps(a, b);
}
static inline Vec4f operator & (Vec4fb const & a, Vec4f const & b) {
    return _mm_and_ps(a, b);
}

// vector operator | : bitwise or
static inline Vec4f operator | (Vec4f const & a, Vec4f const & b) {
    return _mm_or_ps(a, b);
}

// vector operator |= : bitwise or
static inline Vec4f & operator |= (Vec4f & a, Vec4f const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec4f operator ^ (Vec4f const & a, Vec4f const & b) {
    return _mm_xor_ps(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec4f & operator ^= (Vec4f & a, Vec4f const & b) {
    a = a ^ b;
    return a;
}

// vector operator ! : logical not. Returns Boolean vector
static inline Vec4fb operator ! (Vec4f const & a) {
    return a == Vec4f(0.0f);
}


/*****************************************************************************
*
*          Functions for Vec4f
*
*****************************************************************************/

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
// Each element in s must be either 0 (false) or 0xFFFFFFFF (true). No other values are allowed.
static inline Vec4f select (Vec4fb const & s, Vec4f const & a, Vec4f const & b) {
    return selectf(s,a,b);
}
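
// Example (illustrative): per-element maximum expressed with select:
// Vec4f x(1.f, 5.f, 3.f, 7.f), y(4.f, 2.f, 6.f, 0.f);
// Vec4f m = select(x > y, x, y);   // m = (4, 5, 6, 7)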

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec4f if_add (Vec4fb const & f, Vec4f const & a, Vec4f const & b) {
    return a + (Vec4f(f) & b);
}
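
// Example (illustrative):
// Vec4f a(1.f, 2.f, 3.f, 4.f);
// Vec4f r = if_add(a > 2.0f, a, Vec4f(10.f));   // r = (1, 2, 13, 14)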

// Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
static inline Vec4f if_mul (Vec4fb const & f, Vec4f const & a, Vec4f const & b) {
    return a * select(f, b, 1.f);
}


// General arithmetic functions, etc.

// Horizontal add: Calculates the sum of all vector elements.
static inline float horizontal_add (Vec4f const & a) {
#if  INSTRSET >= 3  // SSE3
    __m128 t1 = _mm_hadd_ps(a,a);
    __m128 t2 = _mm_hadd_ps(t1,t1);
    return _mm_cvtss_f32(t2);
#else
    __m128 t1 = _mm_movehl_ps(a,a);
    __m128 t2 = _mm_add_ps(a,t1);
    __m128 t3 = _mm_shuffle_ps(t2,t2,1);
    __m128 t4 = _mm_add_ss(t2,t3);
    return _mm_cvtss_f32(t4);
#endif
}
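
// Example (illustrative): horizontal_add(Vec4f(1.f, 2.f, 3.f, 4.f)) returns 10.0f.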

// function max: a > b ? a : b
static inline Vec4f max(Vec4f const & a, Vec4f const & b) {
    return _mm_max_ps(a,b);
}

// function min: a < b ? a : b
static inline Vec4f min(Vec4f const & a, Vec4f const & b) {
    return _mm_min_ps(a,b);
}

// function abs: absolute value
// Removes sign bit, even for -0.0f, -INF and -NAN
static inline Vec4f abs(Vec4f const & a) {
    __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return _mm_and_ps(a,mask);
}

// function sqrt: square root
static inline Vec4f sqrt(Vec4f const & a) {
    return _mm_sqrt_ps(a);
}

// function square: a * a
static inline Vec4f square(Vec4f const & a) {
    return a * a;
}

// pow(vector,int) function template
template <typename VTYPE>
static inline VTYPE pow_template_i(VTYPE const & x0, int n) {
    VTYPE x = x0;                      // a^(2^i)
    VTYPE y(1.0f);                     // accumulator
    if (n >= 0) {                      // make sure n is not negative
        while (true) {                 // loop for each bit in n
            if (n & 1) y *= x;         // multiply if bit = 1
            n >>= 1;                   // get next bit of n
            if (n == 0) return y;      // finished
            x *= x;                    // x = a^2, a^4, a^8, etc.
        }
    }
    else {                             // n < 0
        return VTYPE(1.0f)/pow_template_i<VTYPE>(x0,-n);  // reciprocal
    }
}

// pow(Vec4f, int):
// The purpose of this template is to prevent implicit conversion of a float
// exponent to int when calling pow(vector, float) and vectormath_exp.h is
// not included

template <typename TT> static Vec4f pow(Vec4f const & a, TT const & n);

// Raise floating point numbers to integer power n
template <>
inline Vec4f pow<int>(Vec4f const & x0, int const & n) {
    return pow_template_i<Vec4f>(x0, n);
}

// allow conversion from unsigned int
template <>
inline Vec4f pow<uint32_t>(Vec4f const & x0, uint32_t const & n) {
    return pow_template_i<Vec4f>(x0, (int)n);
}

// Raise floating point numbers to integer power n, where n is a compile-time constant
template <int n>
static inline Vec4f pow_n(Vec4f const & a) {
    if (n < 0)    return Vec4f(1.0f) / pow_n<-n>(a);
    if (n == 0)   return Vec4f(1.0f);
    if (n >= 256) return pow(a, n);
    Vec4f x = a;                       // a^(2^i)
    Vec4f y;                           // accumulator
    const int lowest = n - (n & (n-1));// lowest set bit in n
    if (n & 1) y = x;
    if (n < 2) return y;
    x = x*x;                           // x^2
    if (n & 2) {
        if (lowest == 2) y = x; else y *= x;
    }
    if (n < 4) return y;
    x = x*x;                           // x^4
    if (n & 4) {
        if (lowest == 4) y = x; else y *= x;
    }
    if (n < 8) return y;
    x = x*x;                           // x^8
    if (n & 8) {
        if (lowest == 8) y = x; else y *= x;
    }
    if (n < 16) return y;
    x = x*x;                           // x^16
    if (n & 16) {
        if (lowest == 16) y = x; else y *= x;
    }
    if (n < 32) return y;
    x = x*x;                           // x^32
    if (n & 32) {
        if (lowest == 32) y = x; else y *= x;
    }
    if (n < 64) return y;
    x = x*x;                           // x^64
    if (n & 64) {
        if (lowest == 64) y = x; else y *= x;
    }
    if (n < 128) return y;
    x = x*x;                           // x^128
    if (n & 128) {
        if (lowest == 128) y = x; else y *= x;
    }
    return y;
}

// implement as function pow(vector, const_int)
template <int n>
static inline Vec4f pow(Vec4f const & a, Const_int_t<n>) {
    return pow_n<n>(a);
}

// implement the same as macro pow_const(vector, int)
#define pow_const(x,n) pow_n<n>(x)
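
// Example (illustrative): both calls compute x^10 by repeated squaring,
// using only 4 multiplications:
// Vec4f x(2.f);
// Vec4f y1 = pow(x, Const_int_t<10>());
// Vec4f y2 = pow_const(x, 10);        // y1 = y2 = (1024, 1024, 1024, 1024)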


// avoid unsafe optimization in function round
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
static inline Vec4f round(Vec4f const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
#elif defined(__clang__) && INSTRSET < 5
// static inline Vec4f round(Vec4f const & a) __attribute__ ((optnone));
// This doesn't work, but current versions of Clang (3.5) don't optimize away signedmagic, even with -funsafe-math-optimizations
// Add volatile to b if future versions fail
#elif defined (_MSC_VER) || defined(__INTEL_COMPILER) && INSTRSET < 5
#pragma float_control(push)
#pragma float_control(precise,on)
#define FLOAT_CONTROL_PRECISE_FOR_ROUND
#endif
// function round: round to nearest integer (even). (result as float vector)
static inline Vec4f round(Vec4f const & a) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_round_ps(a, 8);
#else // SSE2. Use magic number method
    // Note: assume MXCSR control register is set to rounding
    // (don't use conversion to int, it will limit the value to +/- 2^31)
    Vec4f signmask    = _mm_castsi128_ps(constant4ui<0x80000000,0x80000000,0x80000000,0x80000000>());  // -0.0
    Vec4f magic       = _mm_castsi128_ps(constant4ui<0x4B000000,0x4B000000,0x4B000000,0x4B000000>());  // magic number = 2^23
    Vec4f sign        = _mm_and_ps(a, signmask);                                    // signbit of a
    Vec4f signedmagic = _mm_or_ps(magic, sign);                                     // magic number with sign of a
    // volatile
    Vec4f b = a + signedmagic;                                                      // round by adding magic number
    return b - signedmagic;                                                         // .. and subtracting it again
#endif
}
#ifdef FLOAT_CONTROL_PRECISE_FOR_ROUND
#pragma float_control(pop)
#endif
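
// The magic number method works because adding 2^23 to a float with |a| < 2^23
// pushes the fraction bits below the rounding point, so the addition itself
// rounds to an integer in the current MXCSR rounding mode (round to nearest or even).
// Example (illustrative): round(Vec4f(1.5f, 2.5f, -1.5f, 2.3f)) = (2, 2, -2, 2).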

// function truncate: round towards zero. (result as float vector)
static inline Vec4f truncate(Vec4f const & a) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_round_ps(a, 3+8);
#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
    uint32_t t1 = _mm_getcsr();        // MXCSR
    uint32_t t2 = t1 | (3 << 13);      // bit 13-14 = 11
    _mm_setcsr(t2);                    // change MXCSR
    Vec4f r = round(a);                // use magic number method
    _mm_setcsr(t1);                    // restore MXCSR
    return r;
#endif
}

// function floor: round towards minus infinity. (result as float vector)
static inline Vec4f floor(Vec4f const & a) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_round_ps(a, 1+8);
#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
    uint32_t t1 = _mm_getcsr();        // MXCSR
    uint32_t t2 = t1 | (1 << 13);      // bit 13-14 = 01
    _mm_setcsr(t2);                    // change MXCSR
    Vec4f r = round(a);                // use magic number method
    _mm_setcsr(t1);                    // restore MXCSR
    return r;
#endif
}

// function ceil: round towards plus infinity. (result as float vector)
static inline Vec4f ceil(Vec4f const & a) {
#if INSTRSET >= 5   // SSE4.1 supported
    return _mm_round_ps(a, 2+8);
#else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
    uint32_t t1 = _mm_getcsr();        // MXCSR
    uint32_t t2 = t1 | (2 << 13);      // bit 13-14 = 10
    _mm_setcsr(t2);                    // change MXCSR
    Vec4f r = round(a);                // use magic number method
    _mm_setcsr(t1);                    // restore MXCSR
    return r;
#endif
}

// function round_to_int: round to nearest integer (even). (result as integer vector)
static inline Vec4i round_to_int(Vec4f const & a) {
    // Note: assume MXCSR control register is set to rounding
    return _mm_cvtps_epi32(a);
}

// function truncate_to_int: round towards zero. (result as integer vector)
static inline Vec4i truncate_to_int(Vec4f const & a) {
    return _mm_cvttps_epi32(a);
}

// function to_float: convert integer vector to float vector
static inline Vec4f to_float(Vec4i const & a) {
    return _mm_cvtepi32_ps(a);
}

// function to_float: convert unsigned integer vector to float vector
static inline Vec4f to_float(Vec4ui const & a) {
#ifdef __AVX512VL__
    return _mm_cvtepu32_ps(a);
#else
    Vec4f b = to_float(Vec4i(a & 0x7FFFFFFF));             // 31 bits
    Vec4i c = Vec4i(a) >> 31;                              // generate mask from highest bit
    Vec4f d = Vec4f(2147483648.f) & Vec4f(_mm_castsi128_ps(c));// mask floating point constant 2^31
    return b + d;
#endif
}


// Approximate math functions

// approximate reciprocal (Faster than 1.f / a. relative accuracy better than 2^-11)
static inline Vec4f approx_recipr(Vec4f const & a) {
#if INSTRSET >= 9  // use more accurate version if available. (none of these will raise exceptions on zero)
#ifdef __AVX512ER__  // AVX512ER: full precision
    // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rcp28_round_ps(a, _MM_FROUND_NO_EXC);
    return _mm512_castps512_ps128(_mm512_rcp28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
    return _mm_rcp14_ps(a);
#else  // AVX512F: 14 bit precision
    return _mm512_castps512_ps128(_mm512_rcp14_ps(_mm512_castps128_ps512(a)));
#endif
#else  // AVX: 11 bit precision
    return _mm_rcp_ps(a);
#endif
}

// approximate reciprocal squareroot (Faster than 1.f / sqrt(a). Relative accuracy better than 2^-11)
static inline Vec4f approx_rsqrt(Vec4f const & a) {
#if INSTRSET >= 9  // use more accurate version if available. (none of these will raise exceptions on zero)
#ifdef __AVX512ER__  // AVX512ER: full precision
    // todo: if future processors have both AVX512ER and AVX512VL: _mm128_rsqrt28_round_ps(a, _MM_FROUND_NO_EXC);
    return _mm512_castps512_ps128(_mm512_rsqrt28_round_ps(_mm512_castps128_ps512(a), _MM_FROUND_NO_EXC));
#elif defined __AVX512VL__  // AVX512VL: 14 bit precision
    return _mm_rsqrt14_ps(a);
#else  // AVX512F: 14 bit precision
    return _mm512_castps512_ps128(_mm512_rsqrt14_ps(_mm512_castps128_ps512(a)));
#endif
#else  // AVX: 11 bit precision
    return _mm_rsqrt_ps(a);
#endif
}

// Fused multiply and add functions

// Multiply and add
static inline Vec4f mul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
#ifdef __FMA__
    return _mm_fmadd_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm_macc_ps(a, b, c);
#else
    return a * b + c;
#endif
}

// Multiply and subtract
static inline Vec4f mul_sub(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
#ifdef __FMA__
    return _mm_fmsub_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm_msub_ps(a, b, c);
#else
    return a * b - c;
#endif
}

// Multiply and inverse subtract
static inline Vec4f nmul_add(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
#ifdef __FMA__
    return _mm_fnmadd_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm_nmacc_ps(a, b, c);
#else
    return c - a * b;
#endif
}
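
// Example (illustrative): evaluate the polynomial c0 + c1*x + c2*x^2 with
// Horner's rule, where x, c0, c1, c2 are Vec4f variables in this sketch:
// Vec4f p = mul_add(mul_add(c2, x, c1), x, c0);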


// Multiply and subtract with extra precision on the intermediate calculations,
// even if FMA instructions are not supported, using Veltkamp-Dekker split
static inline Vec4f mul_sub_x(Vec4f const & a, Vec4f const & b, Vec4f const & c) {
#ifdef __FMA__
    return _mm_fmsub_ps(a, b, c);
#elif defined (__FMA4__)
    return _mm_msub_ps(a, b, c);
#else
    // calculate a * b - c with extra precision
    Vec4i upper_mask = -(1 << 12);                         // mask to remove lower 12 bits
    Vec4f a_high = a & Vec4f(_mm_castsi128_ps(upper_mask));// split into high and low parts
    Vec4f b_high = b & Vec4f(_mm_castsi128_ps(upper_mask));
    Vec4f a_low  = a - a_high;
    Vec4f b_low  = b - b_high;
    Vec4f r1 = a_high * b_high;                            // this product is exact
    Vec4f r2 = r1 - c;                                     // subtract c from high product
    Vec4f r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
    return r3; // + ((r2 - r1) + c);
#endif
}

// Math functions using fast bit manipulation

// Extract the exponent as an integer
// exponent(a) = floor(log2(abs(a)));
// exponent(1.0f) = 0, exponent(0.0f) = -127, exponent(INF) = +128, exponent(NAN) = +128
static inline Vec4i exponent(Vec4f const & a) {
    Vec4ui t1 = _mm_castps_si128(a);   // reinterpret as 32-bit integer
    Vec4ui t2 = t1 << 1;               // shift out sign bit
    Vec4ui t3 = t2 >> 24;              // shift down logical to position 0
    Vec4i  t4 = Vec4i(t3) - 0x7F;      // subtract bias from exponent
    return t4;
}

// Extract the fraction part of a floating point number
// a = 2^exponent(a) * fraction(a), except for a = 0
// fraction(1.0f) = 1.0f, fraction(5.0f) = 1.25f
// NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h !
static inline Vec4f fraction(Vec4f const & a) {
    Vec4ui t1 = _mm_castps_si128(a);   // reinterpret as 32-bit integer
    Vec4ui t2 = Vec4ui((t1 & 0x007FFFFF) | 0x3F800000); // set exponent to 0 + bias
    return _mm_castsi128_ps(t2);
}

// Fast calculation of pow(2,n) with n integer
// n  =    0 gives 1.0f
// n >=  128 gives +INF
// n <= -127 gives 0.0f
// This function will never produce denormals, and never raise exceptions
static inline Vec4f exp2(Vec4i const & n) {
    Vec4i t1 = max(n,  -0x7F);         // limit to allowed range
    Vec4i t2 = min(t1,  0x80);
    Vec4i t3 = t2 + 0x7F;              // add bias
    Vec4i t4 = t3 << 23;               // put exponent into position 23
    return _mm_castsi128_ps(t4);       // reinterpret as float
}
//static Vec4f exp2(Vec4f const & x); // defined in vectormath_exp.h


// Control word manipulation
// ------------------------
// The MXCSR control word has the following bits:
//  0:    Invalid Operation Flag
//  1:    Denormal Flag (=subnormal)
//  2:    Divide-by-Zero Flag
//  3:    Overflow Flag
//  4:    Underflow Flag
//  5:    Precision Flag
//  6:    Denormals Are Zeros (=subnormals)
//  7:    Invalid Operation Mask
//  8:    Denormal Operation Mask (=subnormal)
//  9:    Divide-by-Zero Mask
// 10:    Overflow Mask
// 11:    Underflow Mask
// 12:    Precision Mask
// 13-14: Rounding control
//        00: round to nearest or even
//        01: round down towards -infinity
//        10: round up   towards +infinity
//        11: round towards zero (truncate)
// 15: Flush to Zero

// Function get_control_word:
// Read the MXCSR control word
static inline uint32_t get_control_word() {
    return _mm_getcsr();
}

// Function set_control_word:
// Write the MXCSR control word
static inline void set_control_word(uint32_t w) {
    _mm_setcsr(w);
}

// Function no_subnormals:
// Set "Denormals Are Zeros" and "Flush to Zero" mode to avoid the extremely
// time-consuming denormals in case of underflow
static inline void no_subnormals() {
    uint32_t t1 = get_control_word();
    t1 |= (1 << 6) | (1 << 15);     // set bit 6 and 15 in MXCSR
    set_control_word(t1);
}

// Function reset_control_word:
// Set the MXCSR control word to the default value 0x1F80.
// This will mask floating point exceptions, set rounding mode to nearest (or even),
// and allow denormals.
static inline void reset_control_word() {
    set_control_word(0x1F80);
}
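
// Example (illustrative): a thread doing heavy floating point work may call
// no_subnormals() once at startup to trade subnormal precision for speed,
// and reset_control_word() to return to the default state afterwards.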


// Categorization functions

// Function sign_bit: gives true for elements that have the sign bit set
// even for -0.0f, -INF and -NAN
// Note that sign_bit(Vec4f(-0.0f)) gives true, while Vec4f(-0.0f) < Vec4f(0.0f) gives false
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec4fb sign_bit(Vec4f const & a) {
    Vec4i t1 = _mm_castps_si128(a);    // reinterpret as 32-bit integer
    Vec4i t2 = t1 >> 31;               // extend sign bit
    return _mm_castsi128_ps(t2);       // reinterpret as 32-bit Boolean
}

// Function sign_combine: changes the sign of a when b has the sign bit set
// same as select(sign_bit(b), -a, a)
static inline Vec4f sign_combine(Vec4f const & a, Vec4f const & b) {
    Vec4f signmask = _mm_castsi128_ps(constant4ui<0x80000000,0x80000000,0x80000000,0x80000000>());  // -0.0
    return a ^ (b & signmask);
}
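
// Example (illustrative): sign_combine(Vec4f(2.0f), Vec4f(-3.0f)) returns
// Vec4f(-2.0f); with a positive b the sign of a is unchanged.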

// Function is_finite: gives true for elements that are normal, denormal or zero,
// false for INF and NAN
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec4fb is_finite(Vec4f const & a) {
    Vec4i t1 = _mm_castps_si128(a);    // reinterpret as 32-bit integer
    Vec4i t2 = t1 << 1;                // shift out sign bit
    Vec4i t3 = Vec4i(t2 & 0xFF000000) != 0xFF000000; // exponent field is not all 1s
    return Vec4ib(t3);
}

// Function is_inf: gives true for elements that are +INF or -INF
// false for finite numbers and NAN
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec4fb is_inf(Vec4f const & a) {
    Vec4i t1 = _mm_castps_si128(a);    // reinterpret as 32-bit integer
    Vec4i t2 = t1 << 1;                // shift out sign bit
    return t2 == Vec4i(0xFF000000);    // exponent is all 1s, fraction is 0
}

// Function is_nan: gives true for elements that are +NAN or -NAN
// false for finite numbers and +/-INF
// (the underscore in the name avoids a conflict with a macro in Intel's mathimf.h)
static inline Vec4fb is_nan(Vec4f const & a) {
    Vec4i t1 = _mm_castps_si128(a);    // reinterpret as 32-bit integer
    Vec4i t2 = t1 << 1;                // shift out sign bit
    Vec4i t3 = 0xFF000000;             // exponent mask
    Vec4i t4 = t2 & t3;                // exponent
    Vec4i t5 = _mm_andnot_si128(t3,t2);// fraction
    return Vec4ib((t4 == t3) & (t5 != 0));// exponent = all 1s and fraction != 0
}

// Function is_subnormal: gives true for elements that are denormal (subnormal)
// false for finite numbers, zero, NAN and INF
static inline Vec4fb is_subnormal(Vec4f const & a) {
    Vec4i t1 = _mm_castps_si128(a);    // reinterpret as 32-bit integer
    Vec4i t2 = t1 << 1;                // shift out sign bit
    Vec4i t3 = 0xFF000000;             // exponent mask
    Vec4i t4 = t2 & t3;                // exponent
    Vec4i t5 = _mm_andnot_si128(t3,t2);// fraction
    return Vec4ib((t4 == 0) & (t5 != 0));// exponent = 0 and fraction != 0
}

// Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
// false for finite numbers, NAN and INF
static inline Vec4fb is_zero_or_subnormal(Vec4f const & a) {
    Vec4i t = _mm_castps_si128(a);     // reinterpret as 32-bit integer
          t &= 0x7F800000;             // isolate exponent
    return t == 0;                     // exponent = 0
}

// Function infinite4f: returns a vector where all elements are +INF
static inline Vec4f infinite4f() {
    return _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
}

// Function nan4f: returns a vector where all elements are NAN (quiet)
static inline Vec4f nan4f(int n = 0x10) {
    return _mm_castsi128_ps(_mm_set1_epi32(0x7FC00000 + n));
}
1312 
1313 
1314 /*****************************************************************************
1315 *
1316 *          Vector Vec4f permute and blend functions
1317 *
1318 ******************************************************************************
1319 *
1320 * The permute function can reorder the elements of a vector and optionally
1321 * set some elements to zero.
1322 *
1323 * The indexes are inserted as template parameters in <>. These indexes must be
1324 * constants. Each template parameter is an index to the element you want to
1325 * select. A negative index will generate zero.
1326 *
1327 * Example:
1328 * Vec4f a(10.f,11.f,12.f,13.f);        // a is (10,11,12,13)
1329 * Vec4f b, c;
1330 * b = permute4f<0,0,2,2>(a);           // b is (10,10,12,12)
1331 * c = permute4f<3,2,-1,-1>(a);         // c is (13,12, 0, 0)
1332 *
1333 *
1334 * The blend function can mix elements from two different vectors and
1335 * optionally set some elements to zero.
1336 *
1337 * The indexes are inserted as template parameters in <>. These indexes must be
1338 * constants. Each template parameter is an index to the element you want to
1339 * select, where indexes 0 - 3 indicate an element from the first source
1340 * vector and indexes 4 - 7 indicate an element from the second source vector.
1341 * A negative index will generate zero.
1342 *
1343 *
1344 * Example:
1345 * Vec4f a(10.f,11.f,12.f,13.f);        // a is (10, 11, 12, 13)
1346 * Vec4f b(20.f,21.f,22.f,23.f);        // b is (20, 21, 22, 23)
1347 * Vec4f c;
1348 * c = blend4f<1,4,-1,7> (a,b);         // c is (11, 20,  0, 23)
1349 *
1350 * Don't worry about the complicated code for these functions. Most of the
1351 * code is resolved at compile time to generate only a few instructions.
1352 *****************************************************************************/
1353 
1354 // permute vector Vec4f
1355 template <int i0, int i1, int i2, int i3>
permute4f(Vec4f const & a)1356 static inline Vec4f permute4f(Vec4f const & a) {
1357     // is shuffling needed
1358     const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0) || (i2 != 2 && i2 >= 0) || (i3 != 3 && i3 >= 0);
1359     // is zeroing needed
1360     const bool do_zero    = (i0 | i1 | i2 | i3) < 0 && ((i0 | i1 | i2 | i3) & 0x80);
1361 
1362     if (!do_shuffle && !do_zero) {
1363         return a;                                          // trivial case: do nothing
1364     }
1365     if (do_zero && !do_shuffle) {                          // zeroing, not shuffling
1366         if ((i0 & i1 & i2 & i3) < 0) return _mm_setzero_ps(); // zero everything
1367         // zero some elements
1368         __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
1369         return  _mm_and_ps(a,_mm_castsi128_ps(mask1));     // zero with AND mask
1370     }
1371     if (do_shuffle && !do_zero) {                          // shuffling, not zeroing
1372         return _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6);
1373     }
1374     // both shuffle and zero
1375     if ((i0 & i1) < 0 && (i2 | i3) >= 0) {                 // zero low half, shuffle high half
1376         return _mm_shuffle_ps(_mm_setzero_ps(), a, (i2&3)<<4 | (i3&3)<<6);
1377     }
1378     if ((i0 | i1) >= 0 && (i2 & i3) < 0) {                 // shuffle low half, zero high half
1379         return _mm_shuffle_ps(a, _mm_setzero_ps(), (i0&3) | (i1&3)<<2);
1380     }
1381 #if  INSTRSET >= 4  // SSSE3
1382     // With SSSE3 we can do both with the PSHUFB instruction
1383     const int j0 = (i0 & 3) << 2;
1384     const int j1 = (i1 & 3) << 2;
1385     const int j2 = (i2 & 3) << 2;
1386     const int j3 = (i3 & 3) << 2;
1387     __m128i mask2 = constant4i <
1388         i0 < 0 ? -1 : j0 | (j0+1)<<8 | (j0+2)<<16 | (j0+3) << 24,
1389         i1 < 0 ? -1 : j1 | (j1+1)<<8 | (j1+2)<<16 | (j1+3) << 24,
1390         i2 < 0 ? -1 : j2 | (j2+1)<<8 | (j2+2)<<16 | (j2+3) << 24,
1391         i3 < 0 ? -1 : j3 | (j3+1)<<8 | (j3+2)<<16 | (j3+3) << 24 > ();
1392     return _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(a),mask2));
1393 #else
1394     __m128 t1 = _mm_shuffle_ps(a, a, (i0&3) | (i1&3)<<2 | (i2&3)<<4 | (i3&3)<<6); // shuffle
1395     __m128i mask3 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
1396     return _mm_and_ps(t1,_mm_castsi128_ps(mask3));     // zero with AND mask
1397 #endif
1398 }
1399 
1400 
1401 // blend vectors Vec4f
1402 template <int i0, int i1, int i2, int i3>
1403 static inline Vec4f blend4f(Vec4f const & a, Vec4f const & b) {
1404 
1405     // Combine all the indexes into a single bitfield, with 8 bits for each
1406     const int m1 = (i0&7) | (i1&7)<<8 | (i2&7)<<16 | (i3&7)<<24;
1407 
1408     // Mask to zero out negative indexes
1409     const int m2 = (i0<0?0:0xFF) | (i1<0?0:0xFF)<<8 | (i2<0?0:0xFF)<<16 | (i3<0?0:0xFF)<<24;
1410 
1411     if ((m1 & 0x04040404 & m2) == 0) {
1412         // no elements from b
1413         return permute4f<i0,i1,i2,i3>(a);
1414     }
1415     if (((m1^0x04040404) & 0x04040404 & m2) == 0) {
1416         // no elements from a
1417         return permute4f<i0&~4, i1&~4, i2&~4, i3&~4>(b);
1418     }
1419     if (((m1 & ~0x04040404) ^ 0x03020100) == 0 && m2 == -1) {
1420         // selecting without shuffling or zeroing
1421         __m128i sel = constant4i <i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1> ();
1422         return selectf(_mm_castsi128_ps(sel), a, b);
1423     }
1424 #ifdef __XOP__     // Use AMD XOP instruction PPERM
1425     __m128i maska = constant4i <
1426         i0 < 0 ? 0x80808080 : (i0*4 & 31) + (((i0*4 & 31) + 1) << 8) + (((i0*4 & 31) + 2) << 16) + (((i0*4 & 31) + 3) << 24),
1427         i1 < 0 ? 0x80808080 : (i1*4 & 31) + (((i1*4 & 31) + 1) << 8) + (((i1*4 & 31) + 2) << 16) + (((i1*4 & 31) + 3) << 24),
1428         i2 < 0 ? 0x80808080 : (i2*4 & 31) + (((i2*4 & 31) + 1) << 8) + (((i2*4 & 31) + 2) << 16) + (((i2*4 & 31) + 3) << 24),
1429         i3 < 0 ? 0x80808080 : (i3*4 & 31) + (((i3*4 & 31) + 1) << 8) + (((i3*4 & 31) + 2) << 16) + (((i3*4 & 31) + 3) << 24) > ();
1430     return _mm_castsi128_ps(_mm_perm_epi8(_mm_castps_si128(a), _mm_castps_si128(b), maska));
1431 #else
1432     if ((((m1 & ~0x04040404) ^ 0x03020100) & m2) == 0) {
1433         // selecting and zeroing, not shuffling
1434         __m128i sel1  = constant4i <i0 & 4 ? 0 : -1, i1 & 4 ? 0 : -1, i2 & 4 ? 0 : -1, i3 & 4 ? 0 : -1> ();
1435         __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
1436         __m128 t1 = selectf(_mm_castsi128_ps(sel1), a, b);   // select
1437         return  _mm_and_ps(t1, _mm_castsi128_ps(mask1));     // zero
1438     }
1439     // special cases unpckhps, unpcklps, shufps
1440     Vec4f t;
1441     if (((m1 ^ 0x05010400) & m2) == 0) {
1442         t = _mm_unpacklo_ps(a, b);
1443         goto DOZERO;
1444     }
1445     if (((m1 ^ 0x01050004) & m2) == 0) {
1446         t = _mm_unpacklo_ps(b, a);
1447         goto DOZERO;
1448     }
1449     if (((m1 ^ 0x07030602) & m2) == 0) {
1450         t = _mm_unpackhi_ps(a, b);
1451         goto DOZERO;
1452     }
1453     if (((m1 ^ 0x03070206) & m2) == 0) {
1454         t = _mm_unpackhi_ps(b, a);
1455         goto DOZERO;
1456     }
1457     // first two elements from a, last two from b
1458     if (((m1^0x04040000) & 0x04040404 & m2) == 0) {
1459         t = _mm_shuffle_ps(a, b, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6));
1460         goto DOZERO;
1461     }
1462     // first two elements from b, last two from a
1463     if (((m1^0x00000404) & 0x04040404 & m2) == 0) {
1464         t = _mm_shuffle_ps(b, a, (i0&3) + ((i1&3)<<2) + ((i2&3)<<4) + ((i3&3)<<6));
1465         goto DOZERO;
1466     }
1467     {   // general case. combine two permutes
1468         __m128 a1 = permute4f <
1469             (uint32_t)i0 < 4 ? i0 : -1,
1470             (uint32_t)i1 < 4 ? i1 : -1,
1471             (uint32_t)i2 < 4 ? i2 : -1,
1472             (uint32_t)i3 < 4 ? i3 : -1  > (a);
1473         __m128 b1 = permute4f <
1474             (uint32_t)(i0^4) < 4 ? (i0^4) : -1,
1475             (uint32_t)(i1^4) < 4 ? (i1^4) : -1,
1476             (uint32_t)(i2^4) < 4 ? (i2^4) : -1,
1477             (uint32_t)(i3^4) < 4 ? (i3^4) : -1  > (b);
1478         return  _mm_or_ps(a1,b1);
1479     }
1480 DOZERO:
1481     if ((i0|i1|i2|i3) & 0x80) {
1482         // zero some elements
1483         __m128i mask1 = constant4i< -int(i0>=0), -int(i1>=0), -int(i2>=0), -int(i3>=0) >();
1484         t = _mm_and_ps(t,_mm_castsi128_ps(mask1));     // zero with AND mask
1485     }
1486     return t;
1487 
1488 #endif // __XOP__
1489 }
1490 
1491 // change signs on vectors Vec4f
1492 // Each index i0 - i3 is 1 for changing sign on the corresponding element, 0 for no change
1493 template <int i0, int i1, int i2, int i3>
1494 static inline Vec4f change_sign(Vec4f const & a) {
1495     if ((i0 | i1 | i2 | i3) == 0) return a;
1496     __m128i mask = constant4ui<i0 ? 0x80000000 : 0, i1 ? 0x80000000 : 0, i2 ? 0x80000000 : 0, i3 ? 0x80000000 : 0>();
1497     return  _mm_xor_ps(a, _mm_castsi128_ps(mask));     // flip sign bits
1498 }
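// Illustrative usage sketch (not part of the original header): flip the
// signs of elements 1 and 3 only:
// Vec4f a(1.0f, 2.0f, 3.0f, 4.0f);
// Vec4f b = change_sign<0,1,0,1>(a);   // b is (1.0f, -2.0f, 3.0f, -4.0f)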
1499 
1500 
1501 /*****************************************************************************
1502 *
1503 *          Vec2d: Vector of 2 double precision floating point values
1504 *
1505 *****************************************************************************/
1506 
1507 class Vec2d {
1508 protected:
1509     __m128d xmm; // double vector
1510 public:
1511     // Default constructor:
1512     Vec2d() {
1513     }
1514     // Constructor to broadcast the same value into all elements:
1515     Vec2d(double d) {
1516         xmm = _mm_set1_pd(d);
1517     }
1518     // Constructor to build from all elements:
1519     Vec2d(double d0, double d1) {
1520         xmm = _mm_setr_pd(d0, d1);
1521     }
1522     // Constructor to convert from type __m128d used in intrinsics:
1523     Vec2d(__m128d const & x) {
1524         xmm = x;
1525     }
1526     // Assignment operator to convert from type __m128d used in intrinsics:
1527     Vec2d & operator = (__m128d const & x) {
1528         xmm = x;
1529         return *this;
1530     }
1531     // Type cast operator to convert to __m128d used in intrinsics
1532     operator __m128d() const {
1533         return xmm;
1534     }
1535     // Member function to load from array (unaligned)
1536     Vec2d & load(double const * p) {
1537         xmm = _mm_loadu_pd(p);
1538         return *this;
1539     }
1540     // Member function to load from array, aligned by 16
1541     // "load_a" is faster than "load" on older Intel processors (Pentium 4, Pentium M, Core 1,
1542     // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
1543     // You may use load_a instead of load if you are certain that p points to an address
1544     // divisible by 16.
1545     Vec2d const & load_a(double const * p) {
1546         xmm = _mm_load_pd(p);
1547         return *this;
1548     }
1549     // Member function to store into array (unaligned)
1550     void store(double * p) const {
1551         _mm_storeu_pd(p, xmm);
1552     }
1553     // Member function to store into array, aligned by 16
1554     // "store_a" is faster than "store" on older Intel processors (Pentium 4, Pentium M, Core 1,
1555     // Merom, Wolfdale) and Atom, but not on other processors from Intel, AMD or VIA.
1556     // You may use store_a instead of store if you are certain that p points to an address
1557     // divisible by 16.
1558     void store_a(double * p) const {
1559         _mm_store_pd(p, xmm);
1560     }
1561     // Partial load. Load n elements and set the rest to 0
1562     Vec2d & load_partial(int n, double const * p) {
1563         if (n == 1) {
1564             xmm = _mm_load_sd(p);
1565         }
1566         else if (n == 2) {
1567             load(p);
1568         }
1569         else {
1570             xmm = _mm_setzero_pd();
1571         }
1572         return *this;
1573     }
1574     // Partial store. Store n elements
1575     void store_partial(int n, double * p) const {
1576         if (n == 1) {
1577             _mm_store_sd(p, xmm);
1578         }
1579         else if (n == 2) {
1580             store(p);
1581         }
1582     }
1583     // cut off vector to n elements. The last 2-n elements are set to zero
1584     Vec2d & cutoff(int n) {
1585         xmm = _mm_castps_pd(Vec4f(_mm_castpd_ps(xmm)).cutoff(n*2));
1586         return *this;
1587     }
1588     // Member function to change a single element in vector
1589     // Note: This function is inefficient. Use load function if changing more than one element
1590     Vec2d const & insert(uint32_t index, double value) {
1591         __m128d v2 = _mm_set_sd(value);
1592         if (index == 0) {
1593             xmm = _mm_shuffle_pd(v2,xmm,2);
1594         }
1595         else {
1596             xmm = _mm_shuffle_pd(xmm,v2,0);
1597         }
1598         return *this;
1599     };
1600     // Member function to extract a single element from vector
1601     double extract(uint32_t index) const {
1602         double x[2];
1603         store(x);
1604         return x[index & 1];
1605     }
1606     // Extract a single element. Use store function if extracting more than one element.
1607     // Operator [] can only read an element, not write.
1608     double operator [] (uint32_t index) const {
1609         return extract(index);
1610     }
1611     static int size() {
1612         return 2;
1613     }
1614 };
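// Illustrative usage sketch for the Vec2d members above (not part of the
// original header):
// double buf[2] = {1.5, 2.5};
// Vec2d v;
// v.load(buf);               // v is (1.5, 2.5)
// v.insert(1, 9.0);          // v is (1.5, 9.0)
// double x = v[0];           // x is 1.5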
1615 
1616 
1617 /*****************************************************************************
1618 *
1619 *          Operators for Vec2d
1620 *
1621 *****************************************************************************/
1622 
1623 // vector operator + : add element by element
1624 static inline Vec2d operator + (Vec2d const & a, Vec2d const & b) {
1625     return _mm_add_pd(a, b);
1626 }
1627 
1628 // vector operator + : add vector and scalar
1629 static inline Vec2d operator + (Vec2d const & a, double b) {
1630     return a + Vec2d(b);
1631 }
1632 static inline Vec2d operator + (double a, Vec2d const & b) {
1633     return Vec2d(a) + b;
1634 }
1635 
1636 // vector operator += : add
1637 static inline Vec2d & operator += (Vec2d & a, Vec2d const & b) {
1638     a = a + b;
1639     return a;
1640 }
1641 
1642 // postfix operator ++
1643 static inline Vec2d operator ++ (Vec2d & a, int) {
1644     Vec2d a0 = a;
1645     a = a + 1.0;
1646     return a0;
1647 }
1648 
1649 // prefix operator ++
1650 static inline Vec2d & operator ++ (Vec2d & a) {
1651     a = a + 1.0;
1652     return a;
1653 }
1654 
1655 // vector operator - : subtract element by element
1656 static inline Vec2d operator - (Vec2d const & a, Vec2d const & b) {
1657     return _mm_sub_pd(a, b);
1658 }
1659 
1660 // vector operator - : subtract vector and scalar
1661 static inline Vec2d operator - (Vec2d const & a, double b) {
1662     return a - Vec2d(b);
1663 }
1664 static inline Vec2d operator - (double a, Vec2d const & b) {
1665     return Vec2d(a) - b;
1666 }
1667 
1668 // vector operator - : unary minus
1669 // Change sign bit, even for 0, INF and NAN
1670 static inline Vec2d operator - (Vec2d const & a) {
1671     return _mm_xor_pd(a, _mm_castsi128_pd(_mm_setr_epi32(0,0x80000000,0,0x80000000)));
1672 }
1673 
1674 // vector operator -= : subtract
1675 static inline Vec2d & operator -= (Vec2d & a, Vec2d const & b) {
1676     a = a - b;
1677     return a;
1678 }
1679 
1680 // postfix operator --
1681 static inline Vec2d operator -- (Vec2d & a, int) {
1682     Vec2d a0 = a;
1683     a = a - 1.0;
1684     return a0;
1685 }
1686 
1687 // prefix operator --
1688 static inline Vec2d & operator -- (Vec2d & a) {
1689     a = a - 1.0;
1690     return a;
1691 }
1692 
1693 // vector operator * : multiply element by element
1694 static inline Vec2d operator * (Vec2d const & a, Vec2d const & b) {
1695     return _mm_mul_pd(a, b);
1696 }
1697 
1698 // vector operator * : multiply vector and scalar
1699 static inline Vec2d operator * (Vec2d const & a, double b) {
1700     return a * Vec2d(b);
1701 }
1702 static inline Vec2d operator * (double a, Vec2d const & b) {
1703     return Vec2d(a) * b;
1704 }
1705 
1706 // vector operator *= : multiply
1707 static inline Vec2d & operator *= (Vec2d & a, Vec2d const & b) {
1708     a = a * b;
1709     return a;
1710 }
1711 
1712 // vector operator / : divide element by element
1713 static inline Vec2d operator / (Vec2d const & a, Vec2d const & b) {
1714     return _mm_div_pd(a, b);
1715 }
1716 
1717 // vector operator / : divide vector and scalar
1718 static inline Vec2d operator / (Vec2d const & a, double b) {
1719     return a / Vec2d(b);
1720 }
1721 static inline Vec2d operator / (double a, Vec2d const & b) {
1722     return Vec2d(a) / b;
1723 }
1724 
1725 // vector operator /= : divide
1726 static inline Vec2d & operator /= (Vec2d & a, Vec2d const & b) {
1727     a = a / b;
1728     return a;
1729 }
1730 
1731 // vector operator == : returns true for elements for which a == b
1732 static inline Vec2db operator == (Vec2d const & a, Vec2d const & b) {
1733     return _mm_cmpeq_pd(a, b);
1734 }
1735 
1736 // vector operator != : returns true for elements for which a != b
1737 static inline Vec2db operator != (Vec2d const & a, Vec2d const & b) {
1738     return _mm_cmpneq_pd(a, b);
1739 }
1740 
1741 // vector operator < : returns true for elements for which a < b
1742 static inline Vec2db operator < (Vec2d const & a, Vec2d const & b) {
1743     return _mm_cmplt_pd(a, b);
1744 }
1745 
1746 // vector operator <= : returns true for elements for which a <= b
1747 static inline Vec2db operator <= (Vec2d const & a, Vec2d const & b) {
1748     return _mm_cmple_pd(a, b);
1749 }
1750 
1751 // vector operator > : returns true for elements for which a > b
1752 static inline Vec2db operator > (Vec2d const & a, Vec2d const & b) {
1753     return b < a;
1754 }
1755 
1756 // vector operator >= : returns true for elements for which a >= b
1757 static inline Vec2db operator >= (Vec2d const & a, Vec2d const & b) {
1758     return b <= a;
1759 }
1760 
1761 // Bitwise logical operators
1762 
1763 // vector operator & : bitwise and
1764 static inline Vec2d operator & (Vec2d const & a, Vec2d const & b) {
1765     return _mm_and_pd(a, b);
1766 }
1767 
1768 // vector operator &= : bitwise and
1769 static inline Vec2d & operator &= (Vec2d & a, Vec2d const & b) {
1770     a = a & b;
1771     return a;
1772 }
1773 
1774 // vector operator & : bitwise and of Vec2d and Vec2db
1775 static inline Vec2d operator & (Vec2d const & a, Vec2db const & b) {
1776     return _mm_and_pd(a, b);
1777 }
1778 static inline Vec2d operator & (Vec2db const & a, Vec2d const & b) {
1779     return _mm_and_pd(a, b);
1780 }
1781 
1782 // vector operator | : bitwise or
1783 static inline Vec2d operator | (Vec2d const & a, Vec2d const & b) {
1784     return _mm_or_pd(a, b);
1785 }
1786 
1787 // vector operator |= : bitwise or
1788 static inline Vec2d & operator |= (Vec2d & a, Vec2d const & b) {
1789     a = a | b;
1790     return a;
1791 }
1792 
1793 // vector operator ^ : bitwise xor
1794 static inline Vec2d operator ^ (Vec2d const & a, Vec2d const & b) {
1795     return _mm_xor_pd(a, b);
1796 }
1797 
1798 // vector operator ^= : bitwise xor
1799 static inline Vec2d & operator ^= (Vec2d & a, Vec2d const & b) {
1800     a = a ^ b;
1801     return a;
1802 }
1803 
1804 // vector operator ! : logical not. Returns Boolean vector
1805 static inline Vec2db operator ! (Vec2d const & a) {
1806     return a == Vec2d(0.0);
1807 }
1808 
1809 
1810 /*****************************************************************************
1811 *
1812 *          Functions for Vec2d
1813 *
1814 *****************************************************************************/
1815 
1816 // Select between two operands. Corresponds to this pseudocode:
1817 // for (int i = 0; i < 2; i++) result[i] = s[i] ? a[i] : b[i];
1818 // Each element in s must be either 0 (false) or 0xFFFFFFFFFFFFFFFF (true).
1819 // No other values are allowed.
1820 static inline Vec2d select (Vec2db const & s, Vec2d const & a, Vec2d const & b) {
1821     return selectd(s,a,b);
1822 }
1823 
1824 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec2db const & f,Vec2d const & a,Vec2d const & b)1825 static inline Vec2d if_add (Vec2db const & f, Vec2d const & a, Vec2d const & b) {
1826     return a + (Vec2d(f) & b);
1827 }
1828 
1829 // Conditional multiply: For all vector elements i: result[i] = f[i] ? (a[i] * b[i]) : a[i]
if_mul(Vec2db const & f,Vec2d const & a,Vec2d const & b)1830 static inline Vec2d if_mul (Vec2db const & f, Vec2d const & a, Vec2d const & b) {
1831     return a * select(f, b, 1.);
1832 }
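// Illustrative example of the three conditionals above (not part of the
// original header):
// Vec2d a(1.0, 2.0), b(10.0, 20.0);
// Vec2db f = a > Vec2d(1.5);           // f  is (false, true)
// Vec2d r1 = select(f, a, b);          // r1 is (10.0,  2.0)
// Vec2d r2 = if_add(f, a, b);          // r2 is ( 1.0, 22.0)
// Vec2d r3 = if_mul(f, a, b);          // r3 is ( 1.0, 40.0)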
1833 
1834 
1835 // General arithmetic functions, etc.
1836 
1837 // Horizontal add: Calculates the sum of all vector elements.
horizontal_add(Vec2d const & a)1838 static inline double horizontal_add (Vec2d const & a) {
1839 #if  INSTRSET >= 3  // SSE3
1840     __m128d t1 = _mm_hadd_pd(a,a);
1841     return _mm_cvtsd_f64(t1);
1842 #else
1843     __m128  t0 = _mm_castpd_ps(a);
1844     __m128d t1 = _mm_castps_pd(_mm_movehl_ps(t0,t0));
1845     __m128d t2 = _mm_add_sd(a,t1);
1846     return _mm_cvtsd_f64(t2);
1847 #endif
1848 }
1849 
1850 // function max: a > b ? a : b
1851 static inline Vec2d max(Vec2d const & a, Vec2d const & b) {
1852     return _mm_max_pd(a,b);
1853 }
1854 
1855 // function min: a < b ? a : b
1856 static inline Vec2d min(Vec2d const & a, Vec2d const & b) {
1857     return _mm_min_pd(a,b);
1858 }
1859 
1860 // function abs: absolute value
1861 // Removes sign bit, even for -0.0, -INF and -NAN
1862 static inline Vec2d abs(Vec2d const & a) {
1863     __m128d mask = _mm_castsi128_pd(_mm_setr_epi32(-1,0x7FFFFFFF,-1,0x7FFFFFFF));
1864     return _mm_and_pd(a,mask);
1865 }
1866 
1867 // function sqrt: square root
1868 static inline Vec2d sqrt(Vec2d const & a) {
1869     return _mm_sqrt_pd(a);
1870 }
1871 
1872 // function square: a * a
1873 static inline Vec2d square(Vec2d const & a) {
1874     return a * a;
1875 }
1876 
1877 // pow(Vec2d, int):
1878 // The purpose of this template is to prevent implicit conversion of a float
1879 // exponent to int when calling pow(vector, float) and vectormath_exp.h is
1880 // not included
1881 
1882 template <typename TT> static Vec2d pow(Vec2d const & a, TT const & n);
1883 
1884 // Raise floating point numbers to integer power n
1885 template <>
1886 inline Vec2d pow<int>(Vec2d const & x0, int const & n) {
1887     return pow_template_i<Vec2d>(x0, n);
1888 }
1889 
1890 // allow conversion from unsigned int
1891 template <>
1892 inline Vec2d pow<uint32_t>(Vec2d const & x0, uint32_t const & n) {
1893     return pow_template_i<Vec2d>(x0, (int)n);
1894 }
1895 
1896 
1897 // Raise floating point numbers to integer power n, where n is a compile-time constant
1898 template <int n>
1899 static inline Vec2d pow_n(Vec2d const & a) {
1900     if (n < 0)    return Vec2d(1.0) / pow_n<-n>(a);
1901     if (n == 0)   return Vec2d(1.0);
1902     if (n >= 256) return pow(a, n);
1903     Vec2d x = a;                       // a^(2^i)
1904     Vec2d y;                           // accumulator
1905     const int lowest = n - (n & (n-1));// lowest set bit in n
1906     if (n & 1) y = x;
1907     if (n < 2) return y;
1908     x = x*x;                           // x^2
1909     if (n & 2) {
1910         if (lowest == 2) y = x; else y *= x;
1911     }
1912     if (n < 4) return y;
1913     x = x*x;                           // x^4
1914     if (n & 4) {
1915         if (lowest == 4) y = x; else y *= x;
1916     }
1917     if (n < 8) return y;
1918     x = x*x;                           // x^8
1919     if (n & 8) {
1920         if (lowest == 8) y = x; else y *= x;
1921     }
1922     if (n < 16) return y;
1923     x = x*x;                           // x^16
1924     if (n & 16) {
1925         if (lowest == 16) y = x; else y *= x;
1926     }
1927     if (n < 32) return y;
1928     x = x*x;                           // x^32
1929     if (n & 32) {
1930         if (lowest == 32) y = x; else y *= x;
1931     }
1932     if (n < 64) return y;
1933     x = x*x;                           // x^64
1934     if (n & 64) {
1935         if (lowest == 64) y = x; else y *= x;
1936     }
1937     if (n < 128) return y;
1938     x = x*x;                           // x^128
1939     if (n & 128) {
1940         if (lowest == 128) y = x; else y *= x;
1941     }
1942     return y;
1943 }
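// pow_n implements binary exponentiation: the exponent is decomposed into
// powers of two obtained by repeated squaring, e.g. a^13 = a^8 * a^4 * a^1.
// Illustrative example (not part of the original header):
// Vec2d x(2.0, 3.0);
// Vec2d y = pow_n<13>(x);    // y is (8192.0, 1594323.0), i.e. (2^13, 3^13)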
1944 
1945 template <int n>
1946 static inline Vec2d pow(Vec2d const & a, Const_int_t<n>) {
1947     return pow_n<n>(a);
1948 }
1949 
1950 
1951 // avoid unsafe optimization in function round
1952 #if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__) && INSTRSET < 5
1953 static inline Vec2d round(Vec2d const & a) __attribute__ ((optimize("-fno-unsafe-math-optimizations")));
1954 #elif defined (FLOAT_CONTROL_PRECISE_FOR_ROUND)
1955 #pragma float_control(push)
1956 #pragma float_control(precise,on)
1957 #endif
1958 // function round: round to nearest integer (even). (result as double vector)
1959 static inline Vec2d round(Vec2d const & a) {
1960 #if INSTRSET >= 5   // SSE4.1 supported
1961     return _mm_round_pd(a, 0+8);
1962 #else // SSE2. Use magic number method
1963     // Note: assume MXCSR control register is set to rounding
1964     // (don't use conversion to int, it will limit the value to +/- 2^31)
1965     Vec2d signmask    = _mm_castsi128_pd(constant4ui<0,0x80000000,0,0x80000000>());  // -0.0
1966     Vec2d magic       = _mm_castsi128_pd(constant4ui<0,0x43300000,0,0x43300000>());  // magic number = 2^52
1967     Vec2d sign        = _mm_and_pd(a, signmask);                                    // signbit of a
1968     Vec2d signedmagic = _mm_or_pd(magic, sign);                                     // magic number with sign of a
1969     return a + signedmagic - signedmagic;                                           // round by adding magic number
1970 #endif
1971 }
1972 #if defined (FLOAT_CONTROL_PRECISE_FOR_ROUND)
1973 #pragma float_control(pop)
1974 #endif
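// Worked example of the magic number method above (illustrative, assumes
// MXCSR is in its default round-to-nearest mode): adding 2^52 pushes the
// fraction bits of any |x| < 2^52 into whole-number positions, so the
// rounding happens in the addition and the subtraction undoes the shift:
// round(Vec2d(2.5, -1.5));   // gives (2.0, -2.0): ties round to even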
1975 
1976 // function truncate: round towards zero. (result as double vector)
1977 static inline Vec2d truncate(Vec2d const & a) {
1978 // (note: may fail on MS Visual Studio 2008, works in later versions)
1979 #if INSTRSET >= 5   // SSE4.1 supported
1980     return _mm_round_pd(a, 3+8);
1981 #else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
1982     uint32_t t1 = _mm_getcsr();        // MXCSR
1983     uint32_t t2 = t1 | (3 << 13);      // bit 13-14 = 11
1984     _mm_setcsr(t2);                    // change MXCSR
1985     Vec2d r = round(a);                // use magic number method
1986     _mm_setcsr(t1);                    // restore MXCSR
1987     return r;
1988 #endif
1989 }
1990 
1991 // function floor: round towards minus infinity. (result as double vector)
1992 // (note: may fail on MS Visual Studio 2008, works in later versions)
1993 static inline Vec2d floor(Vec2d const & a) {
1994 #if INSTRSET >= 5   // SSE4.1 supported
1995     return _mm_round_pd(a, 1+8);
1996 #else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
1997     uint32_t t1 = _mm_getcsr();        // MXCSR
1998     uint32_t t2 = t1 | (1 << 13);      // bit 13-14 = 01
1999     _mm_setcsr(t2);                    // change MXCSR
2000     Vec2d r = round(a);                // use magic number method
2001     _mm_setcsr(t1);                    // restore MXCSR
2002     return r;
2003 #endif
2004 }
2005 
2006 // function ceil: round towards plus infinity. (result as double vector)
2007 static inline Vec2d ceil(Vec2d const & a) {
2008 #if INSTRSET >= 5   // SSE4.1 supported
2009     return _mm_round_pd(a, 2+8);
2010 #else  // SSE2. Use magic number method (conversion to int would limit the value to 2^31)
2011     uint32_t t1 = _mm_getcsr();        // MXCSR
2012     uint32_t t2 = t1 | (2 << 13);      // bit 13-14 = 10
2013     _mm_setcsr(t2);                    // change MXCSR
2014     Vec2d r = round(a);                // use magic number method
2015     _mm_setcsr(t1);                    // restore MXCSR
2016     return r;
2017 #endif
2018 }
2019 
2020 // function truncate_to_int: round towards zero.
2021 static inline Vec4i truncate_to_int(Vec2d const & a, Vec2d const & b) {
2022     Vec4i t1 = _mm_cvttpd_epi32(a);
2023     Vec4i t2 = _mm_cvttpd_epi32(b);
2024     return blend4i<0,1,4,5> (t1, t2);
2025 }
2026 
2027 // function round_to_int: round to nearest integer (even).
2028 // result as 32-bit integer vector
2029 static inline Vec4i round_to_int(Vec2d const & a, Vec2d const & b) {
2030     // Note: assume MXCSR control register is set to rounding
2031     Vec4i t1 = _mm_cvtpd_epi32(a);
2032     Vec4i t2 = _mm_cvtpd_epi32(b);
2033     return blend4i<0,1,4,5> (t1, t2);
2034 }
2035 // function round_to_int: round to nearest integer (even).
2036 // result as 32-bit integer vector. Upper two values of result are 0
2037 static inline Vec4i round_to_int(Vec2d const & a) {
2038     Vec4i t1 = _mm_cvtpd_epi32(a);
2039     return t1;
2040 }
2041 
2042 // function truncate_to_int64: round towards zero. (inefficient)
2043 static inline Vec2q truncate_to_int64(Vec2d const & a) {
2044 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2045     //return _mm_maskz_cvttpd_epi64( __mmask8(0xFF), a);
2046     return _mm_cvttpd_epi64(a);
2047 #else
2048     double aa[2];
2049     a.store(aa);
2050     return Vec2q(int64_t(aa[0]), int64_t(aa[1]));
2051 #endif
2052 }
2053 
2054 // function truncate_to_int64_limited: round towards zero. (inefficient)
2055 // result as 64-bit integer vector, but with limited range. Deprecated!
2056 static inline Vec2q truncate_to_int64_limited(Vec2d const & a) {
2057 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2058     return truncate_to_int64(a);
2059 #else
2060     // Note: assume MXCSR control register is set to rounding
2061     Vec4i t1 = _mm_cvttpd_epi32(a);
2062     return extend_low(t1);
2063 #endif
2064 }
2065 
2066 // function round_to_int64: round to nearest or even. (inefficient)
2067 static inline Vec2q round_to_int64(Vec2d const & a) {
2068 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2069     return _mm_cvtpd_epi64(a);
2070 #else
2071     return truncate_to_int64(round(a));
2072 #endif
2073 }
2074 
2075 // function round_to_int64_limited: round to nearest integer (even). (inefficient)
2076 // result as 64-bit integer vector, but with limited range. Deprecated!
2077 static inline Vec2q round_to_int64_limited(Vec2d const & a) {
2078 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2079     return round_to_int64(a);
2080 #else
2081     // Note: assume MXCSR control register is set to rounding
2082     Vec4i t1 = _mm_cvtpd_epi32(a);
2083     return extend_low(t1);
2084 #endif
2085 }
2086 
2087 // function to_double: convert integer vector elements to double vector (inefficient)
2088 static inline Vec2d to_double(Vec2q const & a) {
2089 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2090     return _mm_maskz_cvtepi64_pd( __mmask8(0xFF), a);
2091 #else
2092     int64_t aa[2];
2093     a.store(aa);
2094     return Vec2d(double(aa[0]), double(aa[1]));
2095 #endif
2096 }
2097 
2098 // function to_double_limited: convert integer vector elements to double vector
2099 // limited to abs(x) < 2^31. Deprecated!
2100 static inline Vec2d to_double_limited(Vec2q const & x) {
2101 #if defined (__AVX512DQ__) && defined (__AVX512VL__)
2102     return to_double(x);
2103 #else
2104     Vec4i compressed = permute4i<0,2,-256,-256>(Vec4i(x));
2105     return _mm_cvtepi32_pd(compressed);
2106 #endif
2107 }
2108 
2109 // function to_double_low: convert integer vector elements [0] and [1] to double vector
2110 static inline Vec2d to_double_low(Vec4i const & a) {
2111     return _mm_cvtepi32_pd(a);
2112 }
2113 
2114 // function to_double_high: convert integer vector elements [2] and [3] to double vector
2115 static inline Vec2d to_double_high(Vec4i const & a) {
2116     return to_double_low(_mm_srli_si128(a,8));
2117 }
2118 
2119 // function compress: convert two Vec2d to one Vec4f
compress(Vec2d const & low,Vec2d const & high)2120 static inline Vec4f compress (Vec2d const & low, Vec2d const & high) {
2121     Vec4f t1 = _mm_cvtpd_ps(low);
2122     Vec4f t2 = _mm_cvtpd_ps(high);
2123     return blend4f<0,1,4,5> (t1, t2);
2124 }
2125 
2126 // Function extend_low : convert Vec4f vector elements [0] and [1] to Vec2d
extend_low(Vec4f const & a)2127 static inline Vec2d extend_low (Vec4f const & a) {
2128     return _mm_cvtps_pd(a);
2129 }
2130 
2131 // Function extend_high : convert Vec4f vector elements [2] and [3] to Vec2d
extend_high(Vec4f const & a)2132 static inline Vec2d extend_high (Vec4f const & a) {
2133     return _mm_cvtps_pd(_mm_movehl_ps(a,a));
2134 }
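// Illustrative round trip between the conversion functions above (not part
// of the original header):
// Vec2d lo(1.0, 2.0), hi(3.0, 4.0);
// Vec4f f  = compress(lo, hi);         // f  is (1.0f, 2.0f, 3.0f, 4.0f)
// Vec2d d0 = extend_low(f);            // d0 is (1.0, 2.0)
// Vec2d d1 = extend_high(f);           // d1 is (3.0, 4.0)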
2135 
2136 
2137 // Fused multiply and add functions
2138 
2139 // Multiply and add
2140 static inline Vec2d mul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
2141 #ifdef __FMA__
2142     return _mm_fmadd_pd(a, b, c);
2143 #elif defined (__FMA4__)
2144     return _mm_macc_pd(a, b, c);
2145 #else
2146     return a * b + c;
2147 #endif
2148 }
2149 
2150 // Multiply and subtract
2151 static inline Vec2d mul_sub(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
2152 #ifdef __FMA__
2153     return _mm_fmsub_pd(a, b, c);
2154 #elif defined (__FMA4__)
2155     return _mm_msub_pd(a, b, c);
2156 #else
2157     return a * b - c;
2158 #endif
2159 }
2160 
2161 // Multiply and inverse subtract
2162 static inline Vec2d nmul_add(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
2163 #ifdef __FMA__
2164     return _mm_fnmadd_pd(a, b, c);
2165 #elif defined (__FMA4__)
2166     return _mm_nmacc_pd(a, b, c);
2167 #else
2168     return c - a * b;
2169 #endif
2170 }
2171 
2172 
2173 // Multiply and subtract with extra precision on the intermediate calculations,
2174 // even if FMA instructions are not supported, using Veltkamp-Dekker split
2175 static inline Vec2d mul_sub_x(Vec2d const & a, Vec2d const & b, Vec2d const & c) {
2176 #ifdef __FMA__
2177     return _mm_fmsub_pd(a, b, c);
2178 #elif defined (__FMA4__)
2179     return _mm_msub_pd(a, b, c);
2180 #else
2181     // calculate a * b - c with extra precision
2182     Vec2q upper_mask = -(1LL << 27);                       // mask to remove lower 27 bits
2183     Vec2d a_high = a & Vec2d(_mm_castsi128_pd(upper_mask));// split into high and low parts
2184     Vec2d b_high = b & Vec2d(_mm_castsi128_pd(upper_mask));
2185     Vec2d a_low  = a - a_high;
2186     Vec2d b_low  = b - b_high;
2187     Vec2d r1 = a_high * b_high;                            // this product is exact
2188     Vec2d r2 = r1 - c;                                     // subtract c from high product
2189     Vec2d r3 = r2 + (a_high * b_low + b_high * a_low) + a_low * b_low; // add rest of product
2190     return r3; // + ((r2 - r1) + c);
2191 #endif
2192 }
2193 
2194 
2195 // Math functions using fast bit manipulation
2196 
2197 // Extract the exponent as an integer
2198 // exponent(a) = floor(log2(abs(a)));
2199 // exponent(1.0) = 0, exponent(0.0) = -1023, exponent(INF) = +1024, exponent(NAN) = +1024
2200 static inline Vec2q exponent(Vec2d const & a) {
2201     Vec2uq t1 = _mm_castpd_si128(a);   // reinterpret as 64-bit integer
2202     Vec2uq t2 = t1 << 1;               // shift out sign bit
2203     Vec2uq t3 = t2 >> 53;              // shift down logical to position 0
2204     Vec2q  t4 = Vec2q(t3) - 0x3FF;     // subtract bias from exponent
2205     return t4;
2206 }
2207 
2208 // Extract the fraction part of a floating point number
2209 // a = 2^exponent(a) * fraction(a), except for a = 0
2210 // fraction(1.0) = 1.0, fraction(5.0) = 1.25
2211 // NOTE: The name fraction clashes with an ENUM in MAC XCode CarbonCore script.h !
2212 static inline Vec2d fraction(Vec2d const & a) {
2213     Vec2uq t1 = _mm_castpd_si128(a);   // reinterpret as 64-bit integer
2214     Vec2uq t2 = Vec2uq((t1 & 0x000FFFFFFFFFFFFFll) | 0x3FF0000000000000ll); // set exponent to 0 + bias
2215     return _mm_castsi128_pd(t2);
2216 }
2217 
2218 // Fast calculation of pow(2,n) with n integer
2219 // n  =     0 gives 1.0
2220 // n >=  1024 gives +INF
2221 // n <= -1023 gives 0.0
2222 // This function will never produce denormals, and never raise exceptions
2223 static inline Vec2d exp2(Vec2q const & n) {
2224     Vec2q t1 = max(n,  -0x3FF);        // limit to allowed range
2225     Vec2q t2 = min(t1,  0x400);
2226     Vec2q t3 = t2 + 0x3FF;             // add bias
2227     Vec2q t4 = t3 << 52;               // put exponent into position 52
2228     return _mm_castsi128_pd(t4);       // reinterpret as double
2229 }
2230 //static Vec2d exp2(Vec2d const & x); // defined in vectormath_exp.h
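// Illustrative sketch of how exponent, fraction and exp2 fit together
// (not part of the original header):
// Vec2d a(40.0, 0.75);
// Vec2q e = exponent(a);               // e is (5, -1)
// Vec2d f = fraction(a);               // f is (1.25, 1.5)
// Vec2d r = exp2(e) * f;               // r equals a again (except for a = 0)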
2231 
2232 
2233 // Categorization functions
2234 
2235 // Function sign_bit: gives true for elements that have the sign bit set
2236 // even for -0.0, -INF and -NAN
2237 // Note that sign_bit(Vec2d(-0.0)) gives true, while Vec2d(-0.0) < Vec2d(0.0) gives false
2238 static inline Vec2db sign_bit(Vec2d const & a) {
2239     Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as 64-bit integer
2240     Vec2q t2 = t1 >> 63;               // extend sign bit
2241     return _mm_castsi128_pd(t2);       // reinterpret as 64-bit Boolean
2242 }
2243 
2244 // Function sign_combine: changes the sign of a when b has the sign bit set
2245 // same as select(sign_bit(b), -a, a)
2246 static inline Vec2d sign_combine(Vec2d const & a, Vec2d const & b) {
2247     Vec2d signmask = _mm_castsi128_pd(constant4ui<0,0x80000000,0,0x80000000>());  // -0.0
2248     return a ^ (b & signmask);
2249 }
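// Illustrative example (not part of the original header):
// Vec2d a(1.5, -2.5), b(-1.0, 1.0);
// Vec2d c = sign_combine(a, b);        // c is (-1.5, -2.5)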
2250 
2251 // Function is_finite: gives true for elements that are normal, denormal or zero,
2252 // false for INF and NAN
2253 static inline Vec2db is_finite(Vec2d const & a) {
2254     Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as integer
2255     Vec2q t2 = t1 << 1;                // shift out sign bit
2256     Vec2q t3 = 0xFFE0000000000000ll;   // exponent mask
2257     Vec2qb t4 = Vec2q(t2 & t3) != t3;  // exponent field is not all 1s
2258     return t4;
2259 }
2260 
2261 // Function is_inf: gives true for elements that are +INF or -INF
2262 // false for finite numbers and NAN
2263 static inline Vec2db is_inf(Vec2d const & a) {
2264     Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as integer
2265     Vec2q t2 = t1 << 1;                // shift out sign bit
2266     return t2 == 0xFFE0000000000000ll; // exponent is all 1s, fraction is 0
2267 }
2268 
2269 // Function is_nan: gives true for elements that are +NAN or -NAN
2270 // false for finite numbers and +/-INF
2271 static inline Vec2db is_nan(Vec2d const & a) {
2272     Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as integer
2273     Vec2q t2 = t1 << 1;                // shift out sign bit
2274     Vec2q t3 = 0xFFE0000000000000ll;   // exponent mask
2275     Vec2q t4 = t2 & t3;                // exponent
2276     Vec2q t5 = _mm_andnot_si128(t3,t2);// fraction
2277     return Vec2qb((t4==t3) & (t5!=0)); // exponent = all 1s and fraction != 0
2278 }
2279 
2280 // Function is_subnormal: gives true for elements that are subnormal (denormal)
2281 // false for finite numbers, zero, NAN and INF
2282 static inline Vec2db is_subnormal(Vec2d const & a) {
2283     Vec2q t1 = _mm_castpd_si128(a);    // reinterpret as 64-bit integer
2284     Vec2q t2 = t1 << 1;                // shift out sign bit
2285     Vec2q t3 = 0xFFE0000000000000ll;   // exponent mask
2286     Vec2q t4 = t2 & t3;                // exponent
2287     Vec2q t5 = _mm_andnot_si128(t3,t2);// fraction
2288     return Vec2qb((t4==0) & (t5!=0));  // exponent = 0 and fraction != 0
2289 }
2290 
2291 // Function is_zero_or_subnormal: gives true for elements that are zero or subnormal (denormal)
2292 // false for finite numbers, NAN and INF
2293 static inline Vec2db is_zero_or_subnormal(Vec2d const & a) {
2294     Vec2q t = _mm_castpd_si128(a);     // reinterpret as 64-bit integer
2295           t &= 0x7FF0000000000000ll;   // isolate exponent
2296     return t == 0;                     // exponent = 0
2297 }
2298 
2299 // Function infinite2d: returns a vector where all elements are +INF
2300 static inline Vec2d infinite2d() {
2301     return _mm_castsi128_pd(_mm_setr_epi32(0,0x7FF00000,0,0x7FF00000));
2302 }
2303 
2304 // Function nan2d: returns a vector where all elements are +NAN (quiet)
2305 static inline Vec2d nan2d(int n = 0x10) {
2306     return _mm_castsi128_pd(_mm_setr_epi32(n, 0x7FF80000, n, 0x7FF80000));
2307 }
2308 
2309 
2310 /*****************************************************************************
2311 *
2312 *          Functions for reinterpretation between vector types
2313 *
2314 *****************************************************************************/
2315 
reinterpret_i(__m128i const & x)2316 static inline __m128i reinterpret_i (__m128i const & x) {
2317     return x;
2318 }
2319 
reinterpret_i(__m128 const & x)2320 static inline __m128i reinterpret_i (__m128  const & x) {
2321     return _mm_castps_si128(x);
2322 }
2323 
reinterpret_i(__m128d const & x)2324 static inline __m128i reinterpret_i (__m128d const & x) {
2325     return _mm_castpd_si128(x);
2326 }
2327 
reinterpret_f(__m128i const & x)2328 static inline __m128  reinterpret_f (__m128i const & x) {
2329     return _mm_castsi128_ps(x);
2330 }
2331 
reinterpret_f(__m128 const & x)2332 static inline __m128  reinterpret_f (__m128  const & x) {
2333     return x;
2334 }
2335 
reinterpret_f(__m128d const & x)2336 static inline __m128  reinterpret_f (__m128d const & x) {
2337     return _mm_castpd_ps(x);
2338 }
2339 
reinterpret_d(__m128i const & x)2340 static inline __m128d reinterpret_d (__m128i const & x) {
2341     return _mm_castsi128_pd(x);
2342 }
2343 
reinterpret_d(__m128 const & x)2344 static inline __m128d reinterpret_d (__m128  const & x) {
2345     return _mm_castps_pd(x);
2346 }
2347 
reinterpret_d(__m128d const & x)2348 static inline __m128d reinterpret_d (__m128d const & x) {
2349     return x;
2350 }
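// Illustrative example (not part of the original header): reinterpret the
// bits of a double vector as integers and back, with no value conversion:
// Vec2d d(1.0, 2.0);
// Vec2q q = reinterpret_i(d);          // raw IEEE-754 bit patterns
// Vec2d e = reinterpret_d(q);          // e is (1.0, 2.0) again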
2351 
2352 
2353 /*****************************************************************************
2354 *
2355 *          Vector permute and blend functions
2356 *
2357 ******************************************************************************
2358 *
2359 * The permute function can reorder the elements of a vector and optionally
2360 * set some elements to zero.
2361 *
2362 * The indexes are inserted as template parameters in <>. These indexes must be
2363 * constants. Each template parameter is an index to the element you want to
2364 * select. An index of -1 will generate zero. An index of -256 means don't care.
2365 *
2366 * Example:
2367 * Vec2d a(10., 11.);              // a is (10, 11)
2368 * Vec2d b, c;
2369 * b = permute2d<1,1>(a);          // b is (11, 11)
2370 * c = permute2d<-1,0>(a);         // c is ( 0, 10)
2371 *
2372 *
2373 * The blend function can mix elements from two different vectors and
2374 * optionally set some elements to zero.
2375 *
2376 * The indexes are inserted as template parameters in <>. These indexes must be
2377 * constants. Each template parameter is an index to the element you want to
2378 * select, where indexes 0 - 1 indicate an element from the first source
2379 * vector and indexes 2 - 3 indicate an element from the second source vector.
2380 * An index of -1 will generate zero.
2381 *
2382 *
2383 * Example:
2384 * Vec2d a(10., 11.);              // a is (10, 11)
2385 * Vec2d b(20., 21.);              // b is (20, 21)
2386 * Vec2d c;
2387 * c = blend2d<0,3> (a,b);         // c is (10, 21)
2388 *
2389 * A lot of the code here is metaprogramming aiming to find the instructions
2390 * that best fit the template parameters and instruction set. The metacode
2391 * will be reduced out to leave only a few vector instructions in release
2392 * mode with optimization on.
2393 *****************************************************************************/
2394 
2395 // permute vector Vec2d
2396 template <int i0, int i1>
2397 static inline Vec2d permute2d(Vec2d const & a) {
2398     // is shuffling needed
2399     const bool do_shuffle = (i0 > 0) || (i1 != 1 && i1 >= 0);
2400     // is zeroing needed
2401     const bool do_zero    = ((i0 | i1) < 0 && (i0 | i1) & 0x80);
2402 
2403     if (do_zero && !do_shuffle) {                          // zeroing, not shuffling
2404         if ((i0 & i1) < 0) return _mm_setzero_pd();        // zero everything
2405         // zero some elements
2406         __m128i mask1 = constant4i< -int(i0>=0), -int(i0>=0), -int(i1>=0), -int(i1>=0) >();
2407         return  _mm_and_pd(a,_mm_castsi128_pd(mask1));     // zero with AND mask
2408     }
2409     else if (do_shuffle && !do_zero) {                     // shuffling, not zeroing
2410         return _mm_shuffle_pd(a, a, (i0&1) | (i1&1)<<1);
2411     }
2412     else if (do_shuffle && do_zero) {                      // shuffling and zeroing
2413         // both shuffle and zero
2414         if (i0 < 0 && i1 >= 0) {                           // zero low half, shuffle high half
2415             return _mm_shuffle_pd(_mm_setzero_pd(), a, (i1 & 1) << 1);
2416         }
2417         if (i0 >= 0 && i1 < 0) {                           // shuffle low half, zero high half
2418             return _mm_shuffle_pd(a, _mm_setzero_pd(), i0 & 1);
2419         }
2420     }
2421     return a;                                              // trivial case: do nothing
2422 }
2423 
2424 
2425 // blend vectors Vec2d
2426 template <int i0, int i1>
2427 static inline Vec2d blend2d(Vec2d const & a, Vec2d const & b) {
2428 
2429     // Combine all the indexes into a single bitfield, with 8 bits for each
2430     const int m1 = (i0 & 3) | (i1 & 3) << 8;
2431 
2432     // Mask to zero out negative indexes
2433     const int m2 = (i0 < 0 ? 0 : 0xFF) | (i1 < 0 ? 0 : 0xFF) << 8;
2434 
2435     if ((m1 & 0x0202 & m2) == 0) {
2436         // no elements from b, only elements from a and possibly zero
2437         return permute2d <i0, i1> (a);
2438     }
2439     if (((m1^0x0202) & 0x0202 & m2) == 0) {
2440         // no elements from a, only elements from b and possibly zero
2441         return permute2d <i0 & ~2, i1 & ~2> (b);
2442     }
2443     // selecting from both a and b without zeroing
2444     if ((i0 & 2) == 0) { // first element from a, second element from b
2445         return _mm_shuffle_pd(a, b, (i0 & 1) | (i1 & 1) << 1);
2446     }
2447     else {         // first element from b, second element from a
2448         return _mm_shuffle_pd(b, a, (i0 & 1) | (i1 & 1) << 1);
2449     }
2450 }
2451 
2452 // change signs on vectors Vec2d
2453 // Each index i0 - i1 is 1 for changing sign on the corresponding element, 0 for no change
2454 template <int i0, int i1>
2455 static inline Vec2d change_sign(Vec2d const & a) {
2456     if ((i0 | i1) == 0) return a;
2457     __m128i mask = constant4ui<0, i0 ? 0x80000000 : 0, 0, i1 ? 0x80000000 : 0> ();
2458     return  _mm_xor_pd(a, _mm_castsi128_pd(mask));     // flip sign bits
2459 }
2460 
2461 
2462 /*****************************************************************************
2463 *
2464 *          Vector lookup functions
2465 *
2466 ******************************************************************************
2467 *
2468 * These functions use vector elements as indexes into a table.
2469 * The table is given as one or more vectors or as an array.
2470 *
2471 * This can be used for several purposes:
2472 *  - table lookup
2473 *  - permute or blend with variable indexes
2474 *  - blend from more than two sources
2475 *  - gather non-contiguous data
2476 *
2477 * An index out of range may produce any value - the actual value produced is
2478 * implementation dependent and may be different for different instruction
2479 * sets. An index out of range does not produce an error message or exception.
2480 *
2481 * Example:
2482 * Vec4i a(2,0,0,3);               // index  a is (  2,   0,   0,   3)
2483 * Vec4f b(1.0f,1.1f,1.2f,1.3f);   // table  b is (1.0, 1.1, 1.2, 1.3)
2484 * Vec4f c;
2485 * c = lookup4 (a,b);              // result c is (1.2, 1.0, 1.0, 1.3)
2486 *
2487 *****************************************************************************/
2488 
2489 static inline Vec4f lookup4(Vec4i const & index, Vec4f const & table) {
2490 #if INSTRSET >= 7  // AVX
2491     return _mm_permutevar_ps(table, index);
2492 #else
2493     int32_t ii[4];
2494     float   tt[6];
2495     table.store(tt);  (index & 3).store(ii);
2496     __m128 r01 = _mm_loadh_pi(_mm_load_ss(&tt[ii[0]]), (const __m64 *)&tt[ii[1]]);
2497     __m128 r23 = _mm_loadh_pi(_mm_load_ss(&tt[ii[2]]), (const __m64 *)&tt[ii[3]]);
2498     return _mm_shuffle_ps(r01, r23, 0x88);
2499 #endif
2500 }
2501 
2502 static inline Vec4f lookup8(Vec4i const & index, Vec4f const & table0, Vec4f const & table1) {
2503 #if INSTRSET >= 8  // AVX2
2504     __m256 tt = _mm256_insertf128_ps(_mm256_castps128_ps256(table0), table1, 1); // combine tables
2505 
2506 #if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
2507     // bug in MS VS 11 beta: operands in wrong order
2508     __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
2509     r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
2510 #elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
2511     // Gcc 4.7.0 has wrong parameter type and operands in wrong order
2512     __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
2513 #else
2514     // no bug version
2515     __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
2516 #endif
2517     return r;
2518 
2519 #elif INSTRSET >= 7  // AVX
2520     __m128  r0 = _mm_permutevar_ps(table0, index);
2521     __m128  r1 = _mm_permutevar_ps(table1, index);
2522     __m128i i4 = _mm_slli_epi32(index, 29);
2523     return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));
2524 
2525 #elif INSTRSET >= 5  // SSE4.1
2526     Vec4f   r0 = lookup4(index, table0);
2527     Vec4f   r1 = lookup4(index, table1);
2528     __m128i i4 = _mm_slli_epi32(index, 29);
2529     return _mm_blendv_ps(r0, r1, _mm_castsi128_ps(i4));
2530 
2531 #else               // SSE2
2532     Vec4f   r0 = lookup4(index, table0);
2533     Vec4f   r1 = lookup4(index, table1);
2534     __m128i i4 = _mm_srai_epi32(_mm_slli_epi32(index, 29), 31);
2535     return selectf(_mm_castsi128_ps(i4), r1, r0);
2536 #endif
2537 }
2538 
2539 template <int n>
2540 static inline Vec4f lookup(Vec4i const & index, float const * table) {
2541     if (n <= 0) return 0.0f;
2542     if (n <= 4) return lookup4(index, Vec4f().load(table));
2543     if (n <= 8) {
2544 #if INSTRSET >= 8  // AVX2
2545         __m256 tt = _mm256_loadu_ps(table);
2546 #if defined (_MSC_VER) && _MSC_VER < 1700 && ! defined(__INTEL_COMPILER)
2547         // bug in MS VS 11 beta: operands in wrong order
2548         __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), _mm256_castps_si256(tt)));
2549         r = _mm_and_ps(r,r); // fix another bug in VS 11 beta (would store r as 256 bits aligned by 16)
2550 #elif defined (GCC_VERSION) && GCC_VERSION <= 40700 && !defined(__INTEL_COMPILER) && !defined(__clang__)
2551         // Gcc 4.7.0 has wrong parameter type and operands in wrong order
2552         __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(_mm256_castsi256_ps(_mm256_castsi128_si256(index)), tt));
2553 #else
2554         // no bug version
2555         __m128 r = _mm256_castps256_ps128(_mm256_permutevar8x32_ps(tt, _mm256_castsi128_si256(index)));
2556 #endif
2557         return r;
2558 #else   // not AVX2
2559         return lookup8(index, Vec4f().load(table), Vec4f().load(table+4));
2560 #endif  // INSTRSET
2561     }
2562     // n > 8. Limit index
2563     Vec4ui index1;
2564     if ((n & (n-1)) == 0) {
2565         // n is a power of 2, make index modulo n
2566         index1 = Vec4ui(index) & (n-1);
2567     }
2568     else {
2569         // n is not a power of 2, limit to n-1
2570         index1 = min(Vec4ui(index), n-1);
2571     }
2572 #if INSTRSET >= 8  // AVX2
2573     return _mm_i32gather_ps(table, index1, 4);
2574 #else
2575     uint32_t ii[4];  index1.store(ii);
2576     return Vec4f(table[ii[0]], table[ii[1]], table[ii[2]], table[ii[3]]);
2577 #endif
2578 }
2579 
2580 static inline Vec2d lookup2(Vec2q const & index, Vec2d const & table) {
2581 #if INSTRSET >= 7  // AVX
2582     return _mm_permutevar_pd(table, index + index);
2583 #else
2584     int32_t ii[4];
2585     double  tt[2];
2586     table.store(tt);  (index & 1).store(ii);
2587     return Vec2d(tt[ii[0]], tt[ii[2]]);
2588 #endif
2589 }
2590 
2591 static inline Vec2d lookup4(Vec2q const & index, Vec2d const & table0, Vec2d const & table1) {
2592 #if INSTRSET >= 7  // AVX
2593     Vec2q index2 = index + index;          // index << 1
2594     __m128d r0 = _mm_permutevar_pd(table0, index2);
2595     __m128d r1 = _mm_permutevar_pd(table1, index2);
2596     __m128i i4 = _mm_slli_epi64(index, 62);
2597     return _mm_blendv_pd(r0, r1, _mm_castsi128_pd(i4));
2598 #else
2599     int32_t ii[4];
2600     double  tt[4];
2601     table0.store(tt);  table1.store(tt + 2);
2602     (index & 3).store(ii);
2603     return Vec2d(tt[ii[0]], tt[ii[2]]);
2604 #endif
2605 }
2606 
2607 template <int n>
2608 static inline Vec2d lookup(Vec2q const & index, double const * table) {
2609     if (n <= 0) return 0.0;
2610     if (n <= 2) return lookup2(index, Vec2d().load(table));
2611 #if INSTRSET < 8  // not AVX2
2612     if (n <= 4) return lookup4(index, Vec2d().load(table), Vec2d().load(table + 2));
2613 #endif
2614     // Limit index
2615     Vec2uq index1;
2616     if ((n & (n-1)) == 0) {
2617         // n is a power of 2, make index modulo n
2618         index1 = Vec2uq(index) & (n-1);
2619     }
2620     else {
2621         // n is not a power of 2, limit to n-1
2622         index1 = min(Vec2uq(index), n-1);
2623     }
2624 #if INSTRSET >= 8  // AVX2
2625     return _mm_i64gather_pd(table, index1, 8);
2626 #else
2627     uint32_t ii[4];  index1.store(ii);
2628     return Vec2d(table[ii[0]], table[ii[2]]);
2629 #endif
2630 }
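// Illustrative example (not part of the original header):
// double table[4] = {1.0, 1.1, 1.2, 1.3};
// Vec2q  idx(3, 0);
// Vec2d  r = lookup<4>(idx, table);    // r is (1.3, 1.0)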
2631 
2632 
2633 /*****************************************************************************
2634 *
2635 *          Gather functions with fixed indexes
2636 *
2637 *****************************************************************************/
2638 // Load elements from array a with indices i0, i1, i2, i3
2639 template <int i0, int i1, int i2, int i3>
2640 static inline Vec4f gather4f(void const * a) {
2641     return reinterpret_f(gather4i<i0, i1, i2, i3>(a));
2642 }
2643 
2644 // Load elements from array a with indices i0, i1
2645 template <int i0, int i1>
2646 static inline Vec2d gather2d(void const * a) {
2647     return reinterpret_d(gather2q<i0, i1>(a));
2648 }
2649 
2650 /*****************************************************************************
2651 *
2652 *          Vector scatter functions
2653 *
2654 ******************************************************************************
2655 *
2656 * These functions write the elements of a vector to arbitrary positions in an
2657 * array in memory. Each vector element is written to an array position
2658 * determined by an index. An element is not written if the corresponding
2659 * index is out of range.
2660 * The indexes can be specified as constant template parameters or as an
2661 * integer vector.
2662 *
2663 * The scatter functions are useful if the data are distributed in a sparse
2664 * manner into the array. If the array is dense then it is more efficient
2665 * to permute the data into the right positions and then write the whole
2666 * permuted vector into the array.
2667 *
2668 * Example:
2669 * Vec8d a(10,11,12,13,14,15,16,17);
2670 * double b[16] = {0};
2671 * scatter<0,2,14,10,1,-1,5,9>(a,b);
2672 * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
2673 *
2674 *****************************************************************************/
2675 
2676 template <int i0, int i1, int i2, int i3>
2677 static inline void scatter(Vec4f const & data, float * array) {
2678 #if defined (__AVX512VL__)
2679     __m128i indx = constant4i<i0,i1,i2,i3>();
2680     __mmask16 mask = uint16_t(i0>=0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3);
2681     _mm_mask_i32scatter_ps(array, mask, indx, data, 4);
2682 #else
2683     const int index[4] = {i0,i1,i2,i3};
2684     for (int i = 0; i < 4; i++) {
2685         if (index[i] >= 0) array[index[i]] = data[i];
2686     }
2687 #endif
2688 }
2689 
2690 template <int i0, int i1>
2691 static inline void scatter(Vec2d const & data, double * array) {
2692     if (i0 >= 0) array[i0] = data[0];
2693     if (i1 >= 0) array[i1] = data[1];
2694 }
2695 
2696 static inline void scatter(Vec4i const & index, uint32_t limit, Vec4f const & data, float * array) {
2697 #if defined (__AVX512VL__)
2698     __mmask16 mask = _mm_cmplt_epu32_mask(index, Vec4ui(limit));
2699     _mm_mask_i32scatter_ps(array, mask, index, data, 4);
2700 #else
2701     for (int i = 0; i < 4; i++) {
2702         if (uint32_t(index[i]) < limit) array[index[i]] = data[i];
2703     }
2704 #endif
2705 }
2706 
2707 static inline void scatter(Vec2q const & index, uint32_t limit, Vec2d const & data, double * array) {
2708     if (uint64_t(index[0]) < uint64_t(limit)) array[index[0]] = data[0];
2709     if (uint64_t(index[1]) < uint64_t(limit)) array[index[1]] = data[1];
2710 }
2711 
2712 static inline void scatter(Vec4i const & index, uint32_t limit, Vec2d const & data, double * array) {
2713     if (uint32_t(index[0]) < limit) array[index[0]] = data[0];
2714     if (uint32_t(index[1]) < limit) array[index[1]] = data[1];
2715 }
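// Illustrative example of the bounds-checked scatter above (not part of
// the original header):
// double b[4] = {0};
// Vec2d  d(10.0, 11.0);
// Vec2q  idx(3, 8);                    // index 8 is out of range: skipped
// scatter(idx, 4, d, b);               // now b = {0, 0, 0, 10.0}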
2716 
2717 /*****************************************************************************
2718 *
2719 *          Horizontal scan functions
2720 *
2721 *****************************************************************************/
2722 
2723 // Get index to the first element that is true. Return -1 if all are false
2724 static inline int horizontal_find_first(Vec4fb const & x) {
2725     return horizontal_find_first(Vec4ib(x));
2726 }
2727 
2728 static inline int horizontal_find_first(Vec2db const & x) {
2729     return horizontal_find_first(Vec2qb(x));
2730 }
2731 
2732 // Count the number of elements that are true
2733 static inline uint32_t horizontal_count(Vec4fb const & x) {
2734     return horizontal_count(Vec4ib(x));
2735 }
2736 
2737 static inline uint32_t horizontal_count(Vec2db const & x) {
2738     return horizontal_count(Vec2qb(x));
2739 }
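// Illustrative example (not part of the original header):
// Vec2d a(1.0, 5.0);
// Vec2db m = a > Vec2d(2.0);           // m is (false, true)
// int i = horizontal_find_first(m);    // i is 1
// uint32_t c = horizontal_count(m);    // c is 1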
2740 
2741 /*****************************************************************************
2742 *
2743 *          Boolean <-> bitfield conversion functions
2744 *
2745 *****************************************************************************/
2746 
2747 // to_bits: convert boolean vector to integer bitfield
2748 static inline uint8_t to_bits(Vec4fb const & x) {
2749     return to_bits(Vec4ib(x));
2750 }
2751 
2752 // to_Vec4fb: convert integer bitfield to boolean vector
2753 static inline Vec4fb to_Vec4fb(uint8_t x) {
2754     return Vec4fb(to_Vec4ib(x));
2755 }
2756 
2757 // to_bits: convert boolean vector to integer bitfield
2758 static inline uint8_t to_bits(Vec2db const & x) {
2759     return to_bits(Vec2qb(x));
2760 }
2761 
2762 // to_Vec2db: convert integer bitfield to boolean vector
2763 static inline Vec2db to_Vec2db(uint8_t x) {
2764     return Vec2db(to_Vec2qb(x));
2765 }
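// Illustrative round trip (not part of the original header):
// Vec2db m(true, false);
// uint8_t bits = to_bits(m);           // bits is 1 (bit 0 = element 0)
// Vec2db m2 = to_Vec2db(bits);         // m2 is (true, false) again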
2766 
2767 #ifdef VCL_NAMESPACE
2768 }
2769 #endif
2770 
2771 #endif // VECTORF128_H
2772