1 /****************************  vectori512e.h   *******************************
2 * Author:        Agner Fog
3 * Date created:  2014-07-23
4 * Last modified: 2017-02-19
5 * Version:       1.27
6 * Project:       vector classes
7 * Description:
8 * Header file defining integer vector classes as interface to intrinsic
9 * functions in x86 microprocessors with AVX512 and later instruction sets.
10 *
11 * Instructions:
12 * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
13 * instruction set, which must be at least AVX512.
14 *
15 * The following vector classes are defined here:
16 * Vec16i    Vector of  16  32-bit signed   integers
17 * Vec16ui   Vector of  16  32-bit unsigned integers
18 * Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
19 * Vec8q     Vector of   8  64-bit signed   integers
20 * Vec8uq    Vector of   8  64-bit unsigned integers
21 * Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
22 *
* Each vector object is emulated here with two 256-bit halves (Vec256b) that together stand in for one 512-bit register.
24 * This header file defines operators and functions for these vectors.
25 *
26 * For detailed instructions, see VectorClass.pdf
27 *
28 * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
29 *****************************************************************************/
30 
31 // check combination of header files
32 #if defined (VECTORI512_H)
33 #if    VECTORI512_H != 1
34 #error Two different versions of vectori512.h included
35 #endif
36 #else
37 #define VECTORI512_H  1
38 
39 #ifdef VCL_NAMESPACE
40 namespace VCL_NAMESPACE {
41 #endif
42 
43 /*****************************************************************************
44 *
45 *          base class Vec512ie
46 *
47 *****************************************************************************/
48 // base class to replace _mm512i when AVX512 is not supported
49 class Vec512ie {
50 protected:
51     Vec256b z0;                         // low half
52     Vec256b z1;                         // high half
53 public:
Vec512ie(void)54     Vec512ie(void) {};                  // default constructor
Vec512ie(Vec8i const & x0,Vec8i const & x1)55     Vec512ie(Vec8i const & x0, Vec8i const & x1) {      // constructor to build from two Vec8i
56         z0 = x0;  z1 = x1;
57     }
get_low()58     Vec8i get_low() const {            // get low half
59         return Vec8i(z0);
60     }
get_high()61     Vec8i get_high() const {           // get high half
62         return Vec8i(z1);
63     }
64 };
65 
66 
67 /*****************************************************************************
68 *
69 *          Vector of 512 1-bit unsigned integers or Booleans
70 *
71 *****************************************************************************/
72 class Vec512b : public Vec512ie {
73 public:
74     // Default constructor:
Vec512b()75     Vec512b() {
76     }
77     // Constructor to build from two Vec256b:
Vec512b(Vec256b const & a0,Vec256b const & a1)78     Vec512b(Vec256b const & a0, Vec256b const & a1) {
79         z0 = a0;  z1 = a1;
80     }
81     // Constructor to convert from type Vec512ie
Vec512b(Vec512ie const & x)82     Vec512b(Vec512ie const & x) {
83         z0 = x.get_low();  z1 = x.get_high();
84     }
85     // Assignment operator to convert from type Vec512ie
86     Vec512b & operator = (Vec512ie const & x) {
87         z0 = x.get_low();  z1 = x.get_high();
88         return *this;
89     }
90     // Member function to load from array (unaligned)
load(void const * p)91     Vec512b & load(void const * p) {
92         z0 = Vec8i().load(p);
93         z1 = Vec8i().load((int32_t const*)p+8);
94         return *this;
95     }
96     // Member function to load from array, aligned by 64
load_a(void const * p)97     Vec512b & load_a(void const * p) {
98         z0 = Vec8i().load_a(p);
99         z1 = Vec8i().load_a((int32_t const*)p+8);
100         return *this;
101     }
102     // Member function to store into array (unaligned)
store(void * p)103     void store(void * p) const {
104         Vec8i(z0).store(p);
105         Vec8i(z1).store((int32_t*)p+8);
106     }
107     // Member function to store into array, aligned by 64
store_a(void * p)108     void store_a(void * p) const {
109         Vec8i(z0).store_a(p);
110         Vec8i(z1).store_a((int32_t*)p+8);
111     }
112     // Member function to change a single bit
113     // Note: This function is inefficient. Use load function if changing more than one bit
set_bit(uint32_t index,int value)114     Vec512b const & set_bit(uint32_t index, int value) {
115         if (index < 256) {
116             z0 = Vec8i(z0).set_bit(index, value);
117         }
118         else {
119             z1 = Vec8i(z1).set_bit(index-256, value);
120         }
121         return *this;
122     }
123     // Member function to get a single bit
124     // Note: This function is inefficient. Use store function if reading more than one bit
get_bit(uint32_t index)125     int get_bit(uint32_t index) const {
126         if (index < 256) {
127             return Vec8i(z0).get_bit(index);
128         }
129         else {
130             return Vec8i(z1).get_bit(index-256);
131         }
132     }
133     // Extract a single element. Use store function if extracting more than one element.
134     // Operator [] can only read an element, not write.
135     bool operator [] (uint32_t index) const {
136         return get_bit(index) != 0;
137     }
138     // Member functions to split into two Vec128b:
get_low()139     Vec256b get_low() const {
140         return z0;
141     }
get_high()142     Vec256b get_high() const {
143         return z1;
144     }
size()145     static int size () {
146         return 512;
147     }
148 };
149 
150 // Define operators for this class
151 
152 // vector operator & : bitwise and
153 static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
154     return Vec512b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
155 }
156 static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
157     return a & b;
158 }
159 
160 // vector operator | : bitwise or
161 static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
162     return Vec512b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
163 }
164 static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
165     return a | b;
166 }
167 
168 // vector operator ^ : bitwise xor
169 static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
170     return Vec512b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
171 }
172 
173 // vector operator ~ : bitwise not
174 static inline Vec512b operator ~ (Vec512b const & a) {
175     return Vec512b(~a.get_low(), ~a.get_high());
176 }
177 
178 // vector operator &= : bitwise and
179 static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
180     a = a & b;
181     return a;
182 }
183 
184 // vector operator |= : bitwise or
185 static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
186     a = a | b;
187     return a;
188 }
189 
190 // vector operator ^= : bitwise xor
191 static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
192     a = a ^ b;
193     return a;
194 }
195 
196 // Define functions for this class
197 
198 // function andnot: a & ~ b
andnot(Vec512b const & a,Vec512b const & b)199 static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
200     return Vec512b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
201 }
202 
203 
204 
205 /*****************************************************************************
206 *
207 *          Generate compile-time constant vector
208 *
209 *****************************************************************************/
210 // Generate a constant vector of 8 integers stored in memory.
211 // Can be converted to any integer vector type
212 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
constant16i()213 static inline Vec512ie constant16i() {
214     static const union {
215         int32_t i[16];
216         Vec256b y[2];  // note: requires C++0x or later. Use option -std=c++0x
217     } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};
218     return Vec512ie(u.y[0], u.y[1]);
219 }
220 
221 
222 /*****************************************************************************
223 *
224 *          Boolean vector base classes for AVX512
225 *
226 *****************************************************************************/
227 
228 class Vec16b : public Vec512b {
229 public:
230     // Default constructor:
Vec16b()231     Vec16b () {
232     }
233     // Constructor to build from all elements:
Vec16b(bool b0,bool b1,bool b2,bool b3,bool b4,bool b5,bool b6,bool b7,bool b8,bool b9,bool b10,bool b11,bool b12,bool b13,bool b14,bool b15)234     Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7,
235     bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
236         *this = Vec512b(Vec8i(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7), Vec8i(-(int)b8, -(int)b9, -(int)b10, -(int)b11, -(int)b12, -(int)b13, -(int)b14, -(int)b15));
237     }
238     // Constructor to convert from type Vec512b
Vec16b(Vec512b const & x)239     Vec16b (Vec512b const & x) {
240         z0 = x.get_low();
241         z1 = x.get_high();
242     }
243     // Constructor to make from two halves
Vec16b(Vec8ib const & x0,Vec8ib const & x1)244     Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
245         z0 = x0;
246         z1 = x1;
247     }
248     // Constructor to make from two halves
Vec16b(Vec8i const & x0,Vec8i const & x1)249     Vec16b (Vec8i const & x0, Vec8i const & x1) {
250         z0 = x0;
251         z1 = x1;
252     }
253     // Constructor to broadcast single value:
Vec16b(bool b)254     Vec16b(bool b) {
255         z0 = z1 = Vec8i(-int32_t(b));
256     }
257     // Assignment operator to broadcast scalar value:
258     Vec16b & operator = (bool b) {
259         z0 = z1 = Vec8i(-int32_t(b));
260         return *this;
261     }
262 private:
263     // Prevent constructing from int, etc. because of ambiguity
264     Vec16b(int b);
265     // Prevent assigning int because of ambiguity
266     Vec16b & operator = (int x);
267 public:
268     // split into two halves
get_low()269     Vec8ib get_low() const {
270         return Vec8ib(z0);
271     }
get_high()272     Vec8ib get_high() const {
273         return Vec8ib(z1);
274     }
275     // Assignment operator to convert from type Vec512b
276     Vec16b & operator = (Vec512b const & x) {
277         z0 = x.get_low();
278         z1 = x.get_high();
279         return *this;
280     }
281     // Member function to change a single element in vector
282     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)283     Vec16b const & insert(uint32_t index, bool value) {
284         if (index < 8) {
285             z0 = Vec8ib(z0).insert(index, value);
286         }
287         else {
288             z1 = Vec8ib(z1).insert(index-8, value);
289         }
290         return *this;
291     }
292     // Member function extract a single element from vector
extract(uint32_t index)293     bool extract(uint32_t index) const {
294         if (index < 8) {
295             return Vec8ib(z0).extract(index);
296         }
297         else {
298             return Vec8ib(z1).extract(index-8);
299         }
300     }
301     // Extract a single element. Operator [] can only read an element, not write.
302     bool operator [] (uint32_t index) const {
303         return extract(index);
304     }
size()305     static int size () {
306         return 16;
307     }
308 };
309 
310 // Define operators for this class
311 
312 // vector operator & : bitwise and
313 static inline Vec16b operator & (Vec16b const & a, Vec16b const & b) {
314     return Vec16b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
315 }
316 static inline Vec16b operator && (Vec16b const & a, Vec16b const & b) {
317     return a & b;
318 }
319 
320 // vector operator | : bitwise or
321 static inline Vec16b operator | (Vec16b const & a, Vec16b const & b) {
322     return Vec16b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
323 }
324 static inline Vec16b operator || (Vec16b const & a, Vec16b const & b) {
325     return a | b;
326 }
327 
328 // vector operator ^ : bitwise xor
329 static inline Vec16b operator ^ (Vec16b const & a, Vec16b const & b) {
330     return Vec16b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
331 }
332 
333 // vector operator ~ : bitwise not
334 static inline Vec16b operator ~ (Vec16b const & a) {
335     return Vec16b(~(a.get_low()), ~(a.get_high()));
336 }
337 
338 // vector operator ! : element not
339 static inline Vec16b operator ! (Vec16b const & a) {
340     return ~a;
341 }
342 
343 // vector operator &= : bitwise and
344 static inline Vec16b & operator &= (Vec16b & a, Vec16b const & b) {
345     a = a & b;
346     return a;
347 }
348 
349 // vector operator |= : bitwise or
350 static inline Vec16b & operator |= (Vec16b & a, Vec16b const & b) {
351     a = a | b;
352     return a;
353 }
354 
355 // vector operator ^= : bitwise xor
356 static inline Vec16b & operator ^= (Vec16b & a, Vec16b const & b) {
357     a = a ^ b;
358     return a;
359 }
360 
361 /*****************************************************************************
362 *
363 *          Functions for boolean vectors
364 *
365 *****************************************************************************/
366 
367 // function andnot: a & ~ b
andnot(Vec16b const & a,Vec16b const & b)368 static inline Vec16b andnot (Vec16b const & a, Vec16b const & b) {
369     return Vec16b(Vec8ib(andnot(a.get_low(),b.get_low())), Vec8ib(andnot(a.get_high(),b.get_high())));
370 }
371 
372 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec16b const & a)373 static inline bool horizontal_and (Vec16b const & a) {
374     return  horizontal_and(a.get_low() & a.get_high());
375 }
376 
377 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec16b const & a)378 static inline bool horizontal_or (Vec16b const & a) {
379     return  horizontal_or(a.get_low() | a.get_high());
380 }
381 
382 
383 /*****************************************************************************
384 *
385 *          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
386 *
387 *****************************************************************************/
388 
389 class Vec16ib : public Vec16b {
390 public:
391     // Default constructor:
Vec16ib()392     Vec16ib () {
393     }
Vec16ib(Vec16b const & x)394     Vec16ib (Vec16b const & x) {
395         z0 = x.get_low();
396         z1 = x.get_high();
397     }
398     // Constructor to build from all elements:
Vec16ib(bool x0,bool x1,bool x2,bool x3,bool x4,bool x5,bool x6,bool x7,bool x8,bool x9,bool x10,bool x11,bool x12,bool x13,bool x14,bool x15)399     Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
400         bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) {
401         z0 = Vec8ib(x0, x1, x2, x3, x4, x5, x6, x7);
402         z1 = Vec8ib(x8, x9, x10, x11, x12, x13, x14, x15);
403     }
404     // Constructor to convert from type Vec512b
Vec16ib(Vec512b const & x)405     Vec16ib (Vec512b const & x) {
406         z0 = x.get_low();
407         z1 = x.get_high();
408     }
409     // Construct from two halves
Vec16ib(Vec8ib const & x0,Vec8ib const & x1)410     Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
411         z0 = x0;
412         z1 = x1;
413     }
414     // Assignment operator to convert from type Vec512b
415     Vec16ib & operator = (Vec512b const & x) {
416         z0 = x.get_low();
417         z1 = x.get_high();
418         return *this;
419     }
420     // Constructor to broadcast scalar value:
Vec16ib(bool b)421     Vec16ib(bool b) : Vec16b(b) {
422     }
423     // Assignment operator to broadcast scalar value:
424     Vec16ib & operator = (bool b) {
425         *this = Vec16b(b);
426         return *this;
427     }
428 private: // Prevent constructing from int, etc.
429     Vec16ib(int b);
430     Vec16ib & operator = (int x);
431 public:
432 };
433 
434 // Define operators for Vec16ib
435 
436 // vector operator & : bitwise and
437 static inline Vec16ib operator & (Vec16ib const & a, Vec16ib const & b) {
438     return Vec16b(a) & Vec16b(b);
439 }
440 static inline Vec16ib operator && (Vec16ib const & a, Vec16ib const & b) {
441     return a & b;
442 }
443 
444 // vector operator | : bitwise or
445 static inline Vec16ib operator | (Vec16ib const & a, Vec16ib const & b) {
446     return Vec16b(a) | Vec16b(b);
447 }
448 static inline Vec16ib operator || (Vec16ib const & a, Vec16ib const & b) {
449     return a | b;
450 }
451 
452 // vector operator ^ : bitwise xor
453 static inline Vec16ib operator ^ (Vec16ib const & a, Vec16ib const & b) {
454     return Vec16b(a) ^ Vec16b(b);
455 }
456 
457 // vector operator ~ : bitwise not
458 static inline Vec16ib operator ~ (Vec16ib const & a) {
459     return ~Vec16b(a);
460 }
461 
462 // vector operator ! : element not
463 static inline Vec16ib operator ! (Vec16ib const & a) {
464     return ~a;
465 }
466 
467 // vector operator &= : bitwise and
468 static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib const & b) {
469     a = a & b;
470     return a;
471 }
472 
473 // vector operator |= : bitwise or
474 static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib const & b) {
475     a = a | b;
476     return a;
477 }
478 
479 // vector operator ^= : bitwise xor
480 static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib const & b) {
481     a = a ^ b;
482     return a;
483 }
484 
485 // vector function andnot
andnot(Vec16ib const & a,Vec16ib const & b)486 static inline Vec16ib andnot (Vec16ib const & a, Vec16ib const & b) {
487     return Vec16ib(andnot(Vec16b(a), Vec16b(b)));
488 }
489 
490 
491 /*****************************************************************************
492 *
493 *          Vec8b: Base class vector of 8 Booleans
494 *
495 *****************************************************************************/
496 
497 class Vec8b : public Vec16b {
498 public:
499     // Default constructor:
Vec8b()500     Vec8b () {
501     }
Vec8b(Vec16b const & x)502     Vec8b (Vec16b const & x) {
503         z0 = x.get_low();
504         z1 = x.get_high();
505     }
506     // Constructor to convert from type Vec512b
Vec8b(Vec512b const & x)507     Vec8b (Vec512b const & x) {
508         z0 = x.get_low();
509         z1 = x.get_high();
510     }
511     // construct from two halves
Vec8b(Vec4qb const & x0,Vec4qb const & x1)512     Vec8b (Vec4qb const & x0, Vec4qb const & x1) {
513         z0 = x0;
514         z1 = x1;
515     }
516     // Constructor to broadcast single value:
Vec8b(bool b)517     Vec8b(bool b) {
518         z0 = z1 = Vec8i(-int32_t(b));
519     }
520     // Assignment operator to broadcast scalar value:
521     Vec8b & operator = (bool b) {
522         z0 = z1 = Vec8i(-int32_t(b));
523         return *this;
524     }
525 private:
526     // Prevent constructing from int, etc. because of ambiguity
527     Vec8b(int b);
528     // Prevent assigning int because of ambiguity
529     Vec8b & operator = (int x);
530 public:
531     // split into two halves
get_low()532     Vec4qb get_low() const {
533         return Vec4qb(z0);
534     }
get_high()535     Vec4qb get_high() const {
536         return Vec4qb(z1);
537     }
538     // Assignment operator to convert from type Vec512b
539     Vec8b & operator = (Vec512b const & x) {
540         z0 = x.get_low();
541         z1 = x.get_high();
542         return *this;
543     }
544     // Member function to change a single element in vector
545     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)546     Vec8b const & insert(uint32_t index, bool value) {
547         if (index < 4) {
548             z0 = Vec4qb(z0).insert(index, value);
549         }
550         else {
551             z1 = Vec4qb(z1).insert(index-4, value);
552         }
553         return *this;
554     }
extract(uint32_t index)555     bool extract(uint32_t index) const {
556         if (index < 4) {
557             return Vec4qb(Vec4q(z0)).extract(index);
558         }
559         else {
560             return Vec4qb(Vec4q(z1)).extract(index-4);
561         }
562     }
563     bool operator [] (uint32_t index) const {
564         return extract(index);
565     }
size()566     static int size () {
567         return 8;
568     }
569 };
570 
571 
572 /*****************************************************************************
573 *
*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
575 *
576 *****************************************************************************/
577 
578 class Vec8qb : public Vec8b {
579 public:
580     // Default constructor:
Vec8qb()581     Vec8qb () {
582     }
Vec8qb(Vec16b const & x)583     Vec8qb (Vec16b const & x) {
584         z0 = x.get_low();
585         z1 = x.get_high();
586     }
587     // Constructor to build from all elements:
Vec8qb(bool x0,bool x1,bool x2,bool x3,bool x4,bool x5,bool x6,bool x7)588     Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) {
589         z0 = Vec4qb(x0, x1, x2, x3);
590         z1 = Vec4qb(x4, x5, x6, x7);
591     }
592     // Constructor to convert from type Vec512b
Vec8qb(Vec512b const & x)593     Vec8qb (Vec512b const & x) {
594         z0 = x.get_low();
595         z1 = x.get_high();
596     }
597     // construct from two halves
Vec8qb(Vec4qb const & x0,Vec4qb const & x1)598     Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
599         z0 = x0;
600         z1 = x1;
601     }
602     // Assignment operator to convert from type Vec512b
603     Vec8qb & operator = (Vec512b const & x) {
604         z0 = x.get_low();
605         z1 = x.get_high();
606         return *this;
607     }
608     // Constructor to broadcast single value:
Vec8qb(bool b)609     Vec8qb(bool b) : Vec8b(b) {
610     }
611     // Assignment operator to broadcast scalar value:
612     Vec8qb & operator = (bool b) {
613         *this = Vec8b(b);
614         return *this;
615     }
616 private:
617     // Prevent constructing from int, etc. because of ambiguity
618     Vec8qb(int b);
619     // Prevent assigning int because of ambiguity
620     Vec8qb & operator = (int x);
621 public:
622 };
623 
624 // Define operators for Vec8qb
625 
626 // vector operator & : bitwise and
627 static inline Vec8qb operator & (Vec8qb const & a, Vec8qb const & b) {
628     return Vec16b(a) & Vec16b(b);
629 }
630 static inline Vec8qb operator && (Vec8qb const & a, Vec8qb const & b) {
631     return a & b;
632 }
633 
634 // vector operator | : bitwise or
635 static inline Vec8qb operator | (Vec8qb const & a, Vec8qb const & b) {
636     return Vec16b(a) | Vec16b(b);
637 }
638 static inline Vec8qb operator || (Vec8qb const & a, Vec8qb const & b) {
639     return a | b;
640 }
641 
642 // vector operator ^ : bitwise xor
643 static inline Vec8qb operator ^ (Vec8qb const & a, Vec8qb const & b) {
644     return Vec16b(a) ^ Vec16b(b);
645 }
646 
647 // vector operator ~ : bitwise not
648 static inline Vec8qb operator ~ (Vec8qb const & a) {
649     return ~Vec16b(a);
650 }
651 
652 // vector operator ! : element not
653 static inline Vec8qb operator ! (Vec8qb const & a) {
654     return ~a;
655 }
656 
657 // vector operator &= : bitwise and
658 static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb const & b) {
659     a = a & b;
660     return a;
661 }
662 
663 // vector operator |= : bitwise or
664 static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb const & b) {
665     a = a | b;
666     return a;
667 }
668 
669 // vector operator ^= : bitwise xor
670 static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb const & b) {
671     a = a ^ b;
672     return a;
673 }
674 
675 // vector function andnot
andnot(Vec8qb const & a,Vec8qb const & b)676 static inline Vec8qb andnot (Vec8qb const & a, Vec8qb const & b) {
677     return Vec8qb(andnot(Vec16b(a), Vec16b(b)));
678 }
679 
680 
681 /*****************************************************************************
682 *
683 *          Vector of 16 32-bit signed integers
684 *
685 *****************************************************************************/
686 
687 class Vec16i: public Vec512b {
688 public:
689     // Default constructor:
Vec16i()690     Vec16i() {
691     }
692     // Constructor to broadcast the same value into all elements:
Vec16i(int i)693     Vec16i(int i) {
694         z0 = z1 = Vec8i(i);
695     }
696     // Constructor to build from all elements:
Vec16i(int32_t i0,int32_t i1,int32_t i2,int32_t i3,int32_t i4,int32_t i5,int32_t i6,int32_t i7,int32_t i8,int32_t i9,int32_t i10,int32_t i11,int32_t i12,int32_t i13,int32_t i14,int32_t i15)697     Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
698     int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
699         z0 = Vec8i(i0, i1, i2, i3, i4, i5, i6, i7);
700         z1 = Vec8i(i8, i9, i10, i11, i12, i13, i14, i15);
701     }
702     // Constructor to build from two Vec8i:
Vec16i(Vec8i const & a0,Vec8i const & a1)703     Vec16i(Vec8i const & a0, Vec8i const & a1) {
704         *this = Vec512b(a0, a1);
705     }
706     // Constructor to convert from type Vec512b
Vec16i(Vec512b const & x)707     Vec16i(Vec512b const & x) {
708         z0 = x.get_low();
709         z1 = x.get_high();
710     }
711     // Assignment operator to convert from type Vec512b
712     Vec16i & operator = (Vec512b const & x) {
713         z0 = x.get_low();
714         z1 = x.get_high();
715         return *this;
716     }
717     // Member function to load from array (unaligned)
load(void const * p)718     Vec16i & load(void const * p) {
719         Vec512b::load(p);
720         return *this;
721     }
722     // Member function to load from array, aligned by 64
load_a(void const * p)723     Vec16i & load_a(void const * p) {
724         Vec512b::load_a(p);
725         return *this;
726     }
727     // Partial load. Load n elements and set the rest to 0
load_partial(int n,void const * p)728     Vec16i & load_partial(int n, void const * p) {
729         if (n < 8) {
730             z0 = Vec8i().load_partial(n, p);
731             z1 = Vec8i(0);
732         }
733         else {
734             z0 = Vec8i().load(p);
735             z1 = Vec8i().load_partial(n - 8, (int32_t const*)p + 8);
736         }
737         return *this;
738     }
739     // Partial store. Store n elements
store_partial(int n,void * p)740     void store_partial(int n, void * p) const {
741         if (n < 8) {
742             Vec8i(get_low()).store_partial(n, p);
743         }
744         else {
745             Vec8i(get_low()).store(p);
746             Vec8i(get_high()).store_partial(n - 8, (int32_t *)p + 8);
747         }
748     }
749     // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)750     Vec16i & cutoff(int n) {
751         if (n < 8) {
752             z0 = Vec8i(z0).cutoff(n);
753             z1 = Vec8i(0);
754         }
755         else {
756             z1 = Vec8i(z1).cutoff(n - 8);
757         }
758         return *this;
759     }
760     // Member function to change a single element in vector
insert(uint32_t index,int32_t value)761     Vec16i const & insert(uint32_t index, int32_t value) {
762         if (index < 8) {
763             z0 = Vec8i(z0).insert(index, value);
764         }
765         else {
766             z1 = Vec8i(z1).insert(index - 8, value);
767         }
768         return *this;
769     }
770     // Member function extract a single element from vector
extract(uint32_t index)771     int32_t extract(uint32_t index) const {
772         if (index < 8) {
773             return Vec8i(z0).extract(index);
774         }
775         else {
776             return Vec8i(z1).extract(index - 8);
777         }
778     }
779     // Extract a single element. Use store function if extracting more than one element.
780     // Operator [] can only read an element, not write.
781     int32_t operator [] (uint32_t index) const {
782         return extract(index);
783     }
784     // Member functions to split into two Vec8i:
get_low()785     Vec8i get_low() const {
786         return Vec8i(z0);
787     }
get_high()788     Vec8i get_high() const {
789         return Vec8i(z1);
790     }
size()791     static int size () {
792         return 16;
793     }
794 };
795 
796 
797 // Define operators for Vec16i
798 
799 // vector operator + : add element by element
800 static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
801     return Vec16i(a.get_low() + b.get_low(), a.get_high() + b.get_high());
802 }
803 
804 // vector operator += : add
805 static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
806     a = a + b;
807     return a;
808 }
809 
810 // postfix operator ++
811 static inline Vec16i operator ++ (Vec16i & a, int) {
812     Vec16i a0 = a;
813     a = a + 1;
814     return a0;
815 }
816 
817 // prefix operator ++
818 static inline Vec16i & operator ++ (Vec16i & a) {
819     a = a + 1;
820     return a;
821 }
822 
823 // vector operator - : subtract element by element
824 static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
825     return Vec16i(a.get_low() - b.get_low(), a.get_high() - b.get_high());
826 }
827 
828 // vector operator - : unary minus
829 static inline Vec16i operator - (Vec16i const & a) {
830     return Vec16i(-a.get_low(), -a.get_high());
831 }
832 
833 // vector operator -= : subtract
834 static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
835     a = a - b;
836     return a;
837 }
838 
839 // postfix operator --
840 static inline Vec16i operator -- (Vec16i & a, int) {
841     Vec16i a0 = a;
842     a = a - 1;
843     return a0;
844 }
845 
846 // prefix operator --
847 static inline Vec16i & operator -- (Vec16i & a) {
848     a = a - 1;
849     return a;
850 }
851 
852 // vector operator * : multiply element by element
853 static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
854     return Vec16i(a.get_low() * b.get_low(), a.get_high() * b.get_high());
855 }
856 
857 // vector operator *= : multiply
858 static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
859     a = a * b;
860     return a;
861 }
862 
863 // vector operator / : divide all elements by same integer
864 // See bottom of file
865 
866 
867 // vector operator << : shift left
868 static inline Vec16i operator << (Vec16i const & a, int32_t b) {
869     return Vec16i(a.get_low() << b, a.get_high() << b);
870 }
871 
872 // vector operator <<= : shift left
873 static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
874     a = a << b;
875     return a;
876 }
877 
878 // vector operator >> : shift right arithmetic
879 static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
880     return Vec16i(a.get_low() >> b, a.get_high() >> b);
881 }
882 
883 // vector operator >>= : shift right arithmetic
884 static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
885     a = a >> b;
886     return a;
887 }
888 
889 // vector operator == : returns true for elements for which a == b
890 static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
891     return Vec16ib(a.get_low() == b.get_low(), a.get_high() == b.get_high());
892 }
893 
894 // vector operator != : returns true for elements for which a != b
895 static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
896     return Vec16ib(a.get_low() != b.get_low(), a.get_high() != b.get_high());
897 }
898 
899 // vector operator > : returns true for elements for which a > b
900 static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
901     return Vec16ib(a.get_low() > b.get_low(), a.get_high() > b.get_high());
902 }
903 
904 // vector operator < : returns true for elements for which a < b
905 static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
906     return b > a;
907 }
908 
909 // vector operator >= : returns true for elements for which a >= b (signed)
910 static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
911     return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
912 }
913 
914 // vector operator <= : returns true for elements for which a <= b (signed)
915 static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
916     return b >= a;
917 }
918 
919 // vector operator & : bitwise and
920 static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
921     return Vec16i(a.get_low() & b.get_low(), a.get_high() & b.get_high());
922 }
923 
924 // vector operator &= : bitwise and
925 static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
926     a = a & b;
927     return a;
928 }
929 
930 // vector operator | : bitwise or
931 static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
932     return Vec16i(a.get_low() | b.get_low(), a.get_high() | b.get_high());
933 }
934 
935 // vector operator |= : bitwise or
936 static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
937     a = a | b;
938     return a;
939 }
940 
941 // vector operator ^ : bitwise xor
942 static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
943     return Vec16i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
944 }
945 
946 // vector operator ^= : bitwise xor
947 static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
948     a = a ^ b;
949     return a;
950 }
951 
952 // vector operator ~ : bitwise not
953 static inline Vec16i operator ~ (Vec16i const & a) {
954     return Vec16i(~(a.get_low()), ~(a.get_high()));
955 }
956 
957 // Functions for this class
958 
959 // Select between two operands. Corresponds to this pseudocode:
960 // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec16ib const & s,Vec16i const & a,Vec16i const & b)961 static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
962     return Vec16i(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
963 }
964 
965 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec16ib const & f,Vec16i const & a,Vec16i const & b)966 static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
967     return Vec16i(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
968 }
969 
970 // Horizontal add: Calculates the sum of all vector elements.
971 // Overflow will wrap around
horizontal_add(Vec16i const & a)972 static inline int32_t horizontal_add (Vec16i const & a) {
973     return horizontal_add(a.get_low() + a.get_high());
974 }
975 
976 // function add_saturated: add element by element, signed with saturation
add_saturated(Vec16i const & a,Vec16i const & b)977 static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
978     return Vec16i(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
979 }
980 
981 // function sub_saturated: subtract element by element, signed with saturation
sub_saturated(Vec16i const & a,Vec16i const & b)982 static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
983     return Vec16i(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
984 }
985 
986 // function max: a > b ? a : b
max(Vec16i const & a,Vec16i const & b)987 static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
988     return Vec16i(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
989 }
990 
991 // function min: a < b ? a : b
min(Vec16i const & a,Vec16i const & b)992 static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
993     return Vec16i(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
994 }
995 
996 // function abs: a >= 0 ? a : -a
abs(Vec16i const & a)997 static inline Vec16i abs(Vec16i const & a) {
998     return Vec16i(abs(a.get_low()), abs(a.get_high()));
999 }
1000 
1001 // function abs_saturated: same as abs, saturate if overflow
abs_saturated(Vec16i const & a)1002 static inline Vec16i abs_saturated(Vec16i const & a) {
1003     return Vec16i(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
1004 }
1005 
1006 // function rotate_left all elements
1007 // Use negative count to rotate right
rotate_left(Vec16i const & a,int b)1008 static inline Vec16i rotate_left(Vec16i const & a, int b) {
1009     return Vec16i(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b));
1010 }
1011 
1012 
1013 /*****************************************************************************
1014 *
1015 *          Vector of 16 32-bit unsigned integers
1016 *
1017 *****************************************************************************/
1018 
1019 class Vec16ui : public Vec16i {
1020 public:
1021     // Default constructor:
Vec16ui()1022     Vec16ui() {
1023     };
1024     // Constructor to broadcast the same value into all elements:
Vec16ui(uint32_t i)1025     Vec16ui(uint32_t i) {
1026         z0 = z1 = Vec8ui(i);
1027     };
1028     // Constructor to build from all elements:
Vec16ui(uint32_t i0,uint32_t i1,uint32_t i2,uint32_t i3,uint32_t i4,uint32_t i5,uint32_t i6,uint32_t i7,uint32_t i8,uint32_t i9,uint32_t i10,uint32_t i11,uint32_t i12,uint32_t i13,uint32_t i14,uint32_t i15)1029     Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
1030     uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
1031         z0 = Vec8ui(i0, i1, i2, i3, i4, i5, i6, i7);
1032         z1 = Vec8ui(i8, i9, i10, i11, i12, i13, i14, i15);
1033     };
1034     // Constructor to build from two Vec8ui:
Vec16ui(Vec8ui const & a0,Vec8ui const & a1)1035     Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
1036         z0 = a0;
1037         z1 = a1;
1038     }
1039     // Constructor to convert from type Vec512b
Vec16ui(Vec512b const & x)1040     Vec16ui(Vec512b const & x) {
1041         *this = x;
1042     };
1043     // Assignment operator to convert from type Vec512b
1044     Vec16ui & operator = (Vec512b const & x) {
1045         z0 = x.get_low();
1046         z1 = x.get_high();
1047         return *this;
1048     };
1049     // Member function to load from array (unaligned)
load(void const * p)1050     Vec16ui & load(void const * p) {
1051         Vec16i::load(p);
1052         return *this;
1053     }
1054     // Member function to load from array, aligned by 64
load_a(void const * p)1055     Vec16ui & load_a(void const * p) {
1056         Vec16i::load_a(p);
1057         return *this;
1058     }
1059     // Member function to change a single element in vector
1060     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,uint32_t value)1061     Vec16ui const & insert(uint32_t index, uint32_t value) {
1062         Vec16i::insert(index, value);
1063         return *this;
1064     }
1065     // Member function extract a single element from vector
extract(uint32_t index)1066     uint32_t extract(uint32_t index) const {
1067         return Vec16i::extract(index);
1068     }
1069     // Extract a single element. Use store function if extracting more than one element.
1070     // Operator [] can only read an element, not write.
1071     uint32_t operator [] (uint32_t index) const {
1072         return extract(index);
1073     }
1074     // Member functions to split into two Vec4ui:
get_low()1075     Vec8ui get_low() const {
1076         return Vec8ui(Vec16i::get_low());
1077     }
get_high()1078     Vec8ui get_high() const {
1079         return Vec8ui(Vec16i::get_high());
1080     }
1081 };
1082 
1083 // Define operators for this class
1084 
1085 // vector operator + : add
1086 static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
1087     return Vec16ui (Vec16i(a) + Vec16i(b));
1088 }
1089 
1090 // vector operator - : subtract
1091 static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
1092     return Vec16ui (Vec16i(a) - Vec16i(b));
1093 }
1094 
1095 // vector operator * : multiply
1096 static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
1097     return Vec16ui (Vec16i(a) * Vec16i(b));
1098 }
1099 
1100 // vector operator / : divide
1101 // See bottom of file
1102 
1103 // vector operator >> : shift right logical all elements
1104 static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
1105     return Vec16ui(a.get_low() >> b, a.get_high() >> b);
1106 }
1107 
1108 // vector operator >> : shift right logical all elements
1109 static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
1110     return a >> (uint32_t)b;
1111 }
1112 
1113 // vector operator >>= : shift right logical
1114 static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
1115     a = a >> b;
1116     return a;
1117 }
1118 
1119 // vector operator >>= : shift right logical
1120 static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
1121     a = a >> uint32_t(b);
1122     return a;
1123 }
1124 
1125 // vector operator << : shift left all elements
1126 static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
1127     return Vec16ui ((Vec16i)a << (int32_t)b);
1128 }
1129 
1130 // vector operator << : shift left all elements
1131 static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
1132     return Vec16ui ((Vec16i)a << (int32_t)b);
1133 }
1134 
1135 // vector operator < : returns true for elements for which a < b (unsigned)
1136 static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
1137     return Vec16ib(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1138 }
1139 
1140 // vector operator > : returns true for elements for which a > b (unsigned)
1141 static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
1142     return b < a;
1143 }
1144 
1145 // vector operator >= : returns true for elements for which a >= b (unsigned)
1146 static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
1147     return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1148 }
1149 
1150 // vector operator <= : returns true for elements for which a <= b (unsigned)
1151 static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
1152     return b >= a;
1153 }
1154 
1155 // vector operator & : bitwise and
1156 static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
1157     return Vec16ui(Vec16i(a) & Vec16i(b));
1158 }
1159 
1160 // vector operator | : bitwise or
1161 static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
1162     return Vec16ui(Vec16i(a) | Vec16i(b));
1163 }
1164 
1165 // vector operator ^ : bitwise xor
1166 static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
1167     return Vec16ui(Vec16i(a) ^ Vec16i(b));
1168 }
1169 
1170 // vector operator ~ : bitwise not
1171 static inline Vec16ui operator ~ (Vec16ui const & a) {
1172     return Vec16ui( ~ Vec16i(a));
1173 }
1174 
1175 // Functions for this class
1176 
1177 // Select between two operands. Corresponds to this pseudocode:
1178 // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec16ib const & s,Vec16ui const & a,Vec16ui const & b)1179 static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
1180     return Vec16ui(select(s, Vec16i(a), Vec16i(b)));
1181 }
1182 
1183 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec16ib const & f,Vec16ui const & a,Vec16ui const & b)1184 static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
1185     return Vec16ui(if_add(f, Vec16i(a), Vec16i(b)));
1186 }
1187 
1188 // Horizontal add: Calculates the sum of all vector elements.
1189 // Overflow will wrap around
horizontal_add(Vec16ui const & a)1190 static inline uint32_t horizontal_add (Vec16ui const & a) {
1191     return horizontal_add((Vec16i)a);
1192 }
1193 
1194 // horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
1195 
1196 // function add_saturated: add element by element, unsigned with saturation
add_saturated(Vec16ui const & a,Vec16ui const & b)1197 static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
1198     return Vec16ui(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
1199 }
1200 
1201 // function sub_saturated: subtract element by element, unsigned with saturation
sub_saturated(Vec16ui const & a,Vec16ui const & b)1202 static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
1203     return Vec16ui(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
1204 }
1205 
1206 // function max: a > b ? a : b
max(Vec16ui const & a,Vec16ui const & b)1207 static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
1208     return Vec16ui(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1209 }
1210 
1211 // function min: a < b ? a : b
min(Vec16ui const & a,Vec16ui const & b)1212 static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
1213     return Vec16ui(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1214 }
1215 
1216 
1217 /*****************************************************************************
1218 *
1219 *          Vector of 8 64-bit signed integers
1220 *
1221 *****************************************************************************/
1222 
1223 class Vec8q : public Vec512b {
1224 public:
1225     // Default constructor:
Vec8q()1226     Vec8q() {
1227     }
1228     // Constructor to broadcast the same value into all elements:
Vec8q(int64_t i)1229     Vec8q(int64_t i) {
1230         z0 = z1 = Vec4q(i);
1231     }
1232     // Constructor to build from all elements:
Vec8q(int64_t i0,int64_t i1,int64_t i2,int64_t i3,int64_t i4,int64_t i5,int64_t i6,int64_t i7)1233     Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
1234         z0 = Vec4q(i0, i1, i2, i3);
1235         z1 = Vec4q(i4, i5, i6, i7);
1236     }
1237     // Constructor to build from two Vec4q:
Vec8q(Vec4q const & a0,Vec4q const & a1)1238     Vec8q(Vec4q const & a0, Vec4q const & a1) {
1239         z0 = a0;
1240         z1 = a1;
1241     }
1242     // Constructor to convert from type Vec512b
Vec8q(Vec512b const & x)1243     Vec8q(Vec512b const & x) {
1244         z0 = x.get_low();
1245         z1 = x.get_high();
1246     }
1247     // Assignment operator to convert from type Vec512b
1248     Vec8q & operator = (Vec512b const & x) {
1249         z0 = x.get_low();
1250         z1 = x.get_high();
1251         return *this;
1252     }
1253     // Member function to load from array (unaligned)
load(void const * p)1254     Vec8q & load(void const * p) {
1255         z0 = Vec4q().load(p);
1256         z1 = Vec4q().load((int64_t const*)p+4);
1257         return *this;
1258     }
1259     // Member function to load from array, aligned by 64
load_a(void const * p)1260     Vec8q & load_a(void const * p) {
1261         z0 = Vec4q().load_a(p);
1262         z1 = Vec4q().load_a((int64_t const*)p+4);
1263         return *this;
1264     }
1265     // Partial load. Load n elements and set the rest to 0
load_partial(int n,void const * p)1266     Vec8q & load_partial(int n, void const * p) {
1267         if (n < 4) {
1268             z0 = Vec4q().load_partial(n, p);
1269             z1 = Vec4q(0);
1270         }
1271         else {
1272             z0 = Vec4q().load(p);
1273             z1 = Vec4q().load_partial(n - 4, (int64_t const*)p + 4);
1274         }
1275         return *this;
1276     }
1277     // Partial store. Store n elements
store_partial(int n,void * p)1278     void store_partial(int n, void * p) const {
1279         if (n < 4) {
1280             Vec4q(get_low()).store_partial(n, p);
1281         }
1282         else {
1283             Vec4q(get_low()).store(p);
1284             Vec4q(get_high()).store_partial(n - 4, (int64_t *)p + 4);
1285         }
1286     }
1287     // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)1288     Vec8q & cutoff(int n) {
1289         if (n < 4) {
1290             z0 = Vec4q(z0).cutoff(n);
1291             z1 = Vec4q(0);
1292         }
1293         else {
1294             z1 = Vec4q(z1).cutoff(n - 4);
1295         }
1296         return *this;
1297     }
1298     // Member function to change a single element in vector
1299     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,int64_t value)1300     Vec8q const & insert(uint32_t index, int64_t value) {
1301         if (index < 4) {
1302             z0 = Vec4q(z0).insert(index, value);
1303         }
1304         else {
1305             z1 = Vec4q(z1).insert(index-4, value);
1306         }
1307         return *this;
1308     }
1309     // Member function extract a single element from vector
extract(uint32_t index)1310     int64_t extract(uint32_t index) const {
1311         if (index < 4) {
1312             return Vec4q(z0).extract(index);
1313         }
1314         else {
1315             return Vec4q(z1).extract(index - 4);
1316         }
1317     }
1318     // Extract a single element. Use store function if extracting more than one element.
1319     // Operator [] can only read an element, not write.
1320     int64_t operator [] (uint32_t index) const {
1321         return extract(index);
1322     }
1323     // Member functions to split into two Vec2q:
get_low()1324     Vec4q get_low() const {
1325         return Vec4q(z0);
1326     }
get_high()1327     Vec4q get_high() const {
1328         return Vec4q(z1);
1329     }
size()1330     static int size () {
1331         return 8;
1332     }
1333 };
1334 
1335 
1336 // Define operators for Vec8q
1337 
1338 // vector operator + : add element by element
1339 static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
1340     return Vec8q(a.get_low() + b.get_low(), a.get_high() + b.get_high());
1341 }
1342 
1343 // vector operator += : add
1344 static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
1345     a = a + b;
1346     return a;
1347 }
1348 
1349 // postfix operator ++
1350 static inline Vec8q operator ++ (Vec8q & a, int) {
1351     Vec8q a0 = a;
1352     a = a + 1;
1353     return a0;
1354 }
1355 
1356 // prefix operator ++
1357 static inline Vec8q & operator ++ (Vec8q & a) {
1358     a = a + 1;
1359     return a;
1360 }
1361 
1362 // vector operator - : subtract element by element
1363 static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
1364     return Vec8q(a.get_low() - b.get_low(), a.get_high() - b.get_high());
1365 }
1366 
1367 // vector operator - : unary minus
1368 static inline Vec8q operator - (Vec8q const & a) {
1369     return Vec8q(- a.get_low(), - a.get_high());
1370 }
1371 
1372 // vector operator -= : subtract
1373 static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
1374     a = a - b;
1375     return a;
1376 }
1377 
1378 // postfix operator --
1379 static inline Vec8q operator -- (Vec8q & a, int) {
1380     Vec8q a0 = a;
1381     a = a - 1;
1382     return a0;
1383 }
1384 
1385 // prefix operator --
1386 static inline Vec8q & operator -- (Vec8q & a) {
1387     a = a - 1;
1388     return a;
1389 }
1390 
1391 // vector operator * : multiply element by element
1392 static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
1393     return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high());
1394 }
1395 
1396 // vector operator *= : multiply
1397 static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
1398     a = a * b;
1399     return a;
1400 }
1401 
1402 // vector operator << : shift left
1403 static inline Vec8q operator << (Vec8q const & a, int32_t b) {
1404     return Vec8q(a.get_low() << b, a.get_high() << b);
1405 }
1406 
1407 // vector operator <<= : shift left
1408 static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
1409     a = a << b;
1410     return a;
1411 }
1412 
1413 // vector operator >> : shift right arithmetic
1414 static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
1415     return Vec8q(a.get_low() >> b, a.get_high() >> b);
1416 }
1417 
1418 // vector operator >>= : shift right arithmetic
1419 static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
1420     a = a >> b;
1421     return a;
1422 }
1423 
1424 // vector operator == : returns true for elements for which a == b
1425 static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
1426     return Vec8qb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
1427 }
1428 
1429 // vector operator != : returns true for elements for which a != b
1430 static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
1431     return Vec8qb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
1432 }
1433 
1434 // vector operator < : returns true for elements for which a < b
1435 static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
1436     return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1437 }
1438 
1439 // vector operator > : returns true for elements for which a > b
1440 static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
1441     return b < a;
1442 }
1443 
1444 // vector operator >= : returns true for elements for which a >= b (signed)
1445 static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
1446     return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1447 }
1448 
1449 // vector operator <= : returns true for elements for which a <= b (signed)
1450 static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
1451     return b >= a;
1452 }
1453 
1454 // vector operator & : bitwise and
1455 static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
1456     return Vec8q(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1457 }
1458 
1459 // vector operator &= : bitwise and
1460 static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
1461     a = a & b;
1462     return a;
1463 }
1464 
1465 // vector operator | : bitwise or
1466 static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
1467     return Vec8q(a.get_low() | b.get_low(), a.get_high() | b.get_high());
1468 }
1469 
1470 // vector operator |= : bitwise or
1471 static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
1472     a = a | b;
1473     return a;
1474 }
1475 
1476 // vector operator ^ : bitwise xor
1477 static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
1478     return Vec8q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
1479 }
1480 // vector operator ^= : bitwise xor
1481 static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
1482     a = a ^ b;
1483     return a;
1484 }
1485 
1486 // vector operator ~ : bitwise not
1487 static inline Vec8q operator ~ (Vec8q const & a) {
1488     return Vec8q(~(a.get_low()), ~(a.get_high()));
1489 }
1490 
1491 // Functions for this class
1492 
1493 // Select between two operands. Corresponds to this pseudocode:
1494 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec8qb const & s,Vec8q const & a,Vec8q const & b)1495 static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
1496     return Vec8q(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
1497 }
1498 
1499 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8qb const & f,Vec8q const & a,Vec8q const & b)1500 static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
1501     return Vec8q(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
1502 }
1503 
1504 // Horizontal add: Calculates the sum of all vector elements.
1505 // Overflow will wrap around
horizontal_add(Vec8q const & a)1506 static inline int64_t horizontal_add (Vec8q const & a) {
1507     return horizontal_add(a.get_low() + a.get_high());
1508 }
1509 
1510 // Horizontal add extended: Calculates the sum of all vector elements
1511 // Elements are sign extended before adding to avoid overflow
horizontal_add_x(Vec16i const & x)1512 static inline int64_t horizontal_add_x (Vec16i const & x) {
1513     return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high());
1514 }
1515 
1516 // Horizontal add extended: Calculates the sum of all vector elements
1517 // Elements are zero extended before adding to avoid overflow
horizontal_add_x(Vec16ui const & x)1518 static inline uint64_t horizontal_add_x (Vec16ui const & x) {
1519     return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high());
1520 }
1521 
1522 // function max: a > b ? a : b
max(Vec8q const & a,Vec8q const & b)1523 static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
1524     return Vec8q(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1525 }
1526 
1527 // function min: a < b ? a : b
min(Vec8q const & a,Vec8q const & b)1528 static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
1529     return Vec8q(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1530 }
1531 
1532 // function abs: a >= 0 ? a : -a
abs(Vec8q const & a)1533 static inline Vec8q abs(Vec8q const & a) {
1534     return Vec8q(abs(a.get_low()), abs(a.get_high()));
1535 }
1536 
1537 // function abs_saturated: same as abs, saturate if overflow
abs_saturated(Vec8q const & a)1538 static inline Vec8q abs_saturated(Vec8q const & a) {
1539     return Vec8q(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
1540 }
1541 
1542 // function rotate_left all elements
1543 // Use negative count to rotate right
rotate_left(Vec8q const & a,int b)1544 static inline Vec8q rotate_left(Vec8q const & a, int b) {
1545     return Vec8q(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b));
1546 }
1547 
1548 
1549 /*****************************************************************************
1550 *
1551 *          Vector of 8 64-bit unsigned integers
1552 *
1553 *****************************************************************************/
1554 
1555 class Vec8uq : public Vec8q {
1556 public:
1557     // Default constructor:
Vec8uq()1558     Vec8uq() {
1559     }
1560     // Constructor to broadcast the same value into all elements:
Vec8uq(uint64_t i)1561     Vec8uq(uint64_t i) {
1562         z0 = z1 = Vec4uq(i);
1563     }
1564     // Constructor to convert from Vec8q:
Vec8uq(Vec8q const & x)1565     Vec8uq(Vec8q const & x) {
1566         z0 = x.get_low();
1567         z1 = x.get_high();
1568     }
1569     // Constructor to convert from type Vec512b
Vec8uq(Vec512b const & x)1570     Vec8uq(Vec512b const & x) {
1571         z0 = x.get_low();
1572         z1 = x.get_high();
1573     }
1574     // Constructor to build from all elements:
Vec8uq(uint64_t i0,uint64_t i1,uint64_t i2,uint64_t i3,uint64_t i4,uint64_t i5,uint64_t i6,uint64_t i7)1575     Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
1576         z0 = Vec4q(i0, i1, i2, i3);
1577         z1 = Vec4q(i4, i5, i6, i7);
1578     }
1579     // Constructor to build from two Vec4uq:
Vec8uq(Vec4uq const & a0,Vec4uq const & a1)1580     Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
1581         z0 = a0;
1582         z1 = a1;
1583     }
1584     // Assignment operator to convert from Vec8q:
1585     Vec8uq  & operator = (Vec8q const & x) {
1586         z0 = x.get_low();
1587         z1 = x.get_high();
1588         return *this;
1589     }
1590     // Assignment operator to convert from type Vec512b
1591     Vec8uq & operator = (Vec512b const & x) {
1592         z0 = x.get_low();
1593         z1 = x.get_high();
1594         return *this;
1595     }
1596     // Member function to load from array (unaligned)
load(void const * p)1597     Vec8uq & load(void const * p) {
1598         Vec8q::load(p);
1599         return *this;
1600     }
1601     // Member function to load from array, aligned by 32
load_a(void const * p)1602     Vec8uq & load_a(void const * p) {
1603         Vec8q::load_a(p);
1604         return *this;
1605     }
1606     // Member function to change a single element in vector
1607     // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,uint64_t value)1608     Vec8uq const & insert(uint32_t index, uint64_t value) {
1609         Vec8q::insert(index, value);
1610         return *this;
1611     }
1612     // Member function extract a single element from vector
extract(uint32_t index)1613     uint64_t extract(uint32_t index) const {
1614         return Vec8q::extract(index);
1615     }
1616     // Extract a single element. Use store function if extracting more than one element.
1617     // Operator [] can only read an element, not write.
1618     uint64_t operator [] (uint32_t index) const {
1619         return extract(index);
1620     }
1621     // Member functions to split into two Vec2uq:
get_low()1622     Vec4uq get_low() const {
1623         return Vec4uq(Vec8q::get_low());
1624     }
get_high()1625     Vec4uq get_high() const {
1626         return Vec4uq(Vec8q::get_high());
1627     }
1628 };
1629 
1630 // Define operators for this class
1631 
1632 // vector operator + : add
1633 static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
1634     return Vec8uq (Vec8q(a) + Vec8q(b));
1635 }
1636 
1637 // vector operator - : subtract
1638 static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
1639     return Vec8uq (Vec8q(a) - Vec8q(b));
1640 }
1641 
1642 // vector operator * : multiply element by element
1643 static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
1644     return Vec8uq (Vec8q(a) * Vec8q(b));
1645 }
1646 
1647 // vector operator >> : shift right logical all elements
1648 static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
1649     return Vec8uq(a.get_low() >> b, a.get_high() >> b);
1650 }
1651 
1652 // vector operator >> : shift right logical all elements
1653 static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
1654     return a >> (uint32_t)b;
1655 }
1656 
1657 // vector operator >>= : shift right artihmetic
1658 static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
1659     a = a >> b;
1660     return a;
1661 }
1662 
1663 // vector operator >>= : shift right logical
1664 static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
1665     a = a >> uint32_t(b);
1666     return a;
1667 }
1668 
1669 // vector operator << : shift left all elements
1670 static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
1671     return Vec8uq ((Vec8q)a << (int32_t)b);
1672 }
1673 
1674 // vector operator << : shift left all elements
1675 static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
1676     return Vec8uq ((Vec8q)a << b);
1677 }
1678 
1679 // vector operator < : returns true for elements for which a < b (unsigned)
1680 static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
1681     return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1682 }
1683 
1684 // vector operator > : returns true for elements for which a > b (unsigned)
1685 static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
1686     return b < a;
1687 }
1688 
1689 // vector operator >= : returns true for elements for which a >= b (unsigned)
1690 static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
1691     return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1692 }
1693 
1694 // vector operator <= : returns true for elements for which a <= b (unsigned)
1695 static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
1696     return b >= a;
1697 }
1698 
1699 // vector operator & : bitwise and
1700 static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
1701     return Vec8uq(Vec8q(a) & Vec8q(b));
1702 }
1703 
1704 // vector operator | : bitwise or
1705 static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
1706     return Vec8uq(Vec8q(a) | Vec8q(b));
1707 }
1708 
1709 // vector operator ^ : bitwise xor
1710 static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
1711     return Vec8uq(Vec8q(a) ^ Vec8q(b));
1712 }
1713 
1714 // Functions for this class
1715 
1716 // Select between two operands. Corresponds to this pseudocode:
1717 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec8qb const & s,Vec8uq const & a,Vec8uq const & b)1718 static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
1719     return Vec8uq(select(s, Vec8q(a), Vec8q(b)));
1720 }
1721 
1722 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8qb const & f,Vec8uq const & a,Vec8uq const & b)1723 static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
1724     return Vec8uq(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
1725 }
1726 
1727 // Horizontal add: Calculates the sum of all vector elements.
1728 // Overflow will wrap around
horizontal_add(Vec8uq const & a)1729 static inline uint64_t horizontal_add (Vec8uq const & a) {
1730     return horizontal_add(Vec8q(a));
1731 }
1732 
1733 // function max: a > b ? a : b
max(Vec8uq const & a,Vec8uq const & b)1734 static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
1735     return Vec8uq(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1736 }
1737 
1738 // function min: a < b ? a : b
min(Vec8uq const & a,Vec8uq const & b)1739 static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
1740     return Vec8uq(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1741 }
1742 
1743 
1744 /*****************************************************************************
1745 *
1746 *          Vector permute functions
1747 *
1748 ******************************************************************************
1749 *
1750 * These permute functions can reorder the elements of a vector and optionally
1751 * set some elements to zero.
1752 *
1753 * The indexes are inserted as template parameters in <>. These indexes must be
1754 * constants. Each template parameter is an index to the element you want to select.
1755 * An index of -1 will generate zero. An index of -256 means don't care.
1756 *
1757 * Example:
1758 * Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
1759 * Vec8q b;
1760 * b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
1761 *
1762 * A lot of the code here is metaprogramming aiming to find the instructions
1763 * that best fit the template parameters and instruction set. The metacode
1764 * will be reduced out to leave only a few vector instructions in release
1765 * mode with optimization on.
1766 *****************************************************************************/
1767 
1768 // Permute vector of 8 64-bit integers.
1769 // Index -1 gives 0, index -256 means don't care.
1770 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8q(Vec8q const & a)1771 static inline Vec8q permute8q(Vec8q const & a) {
1772     return Vec8q(blend4q<i0,i1,i2,i3> (a.get_low(), a.get_high()),
1773                  blend4q<i4,i5,i6,i7> (a.get_low(), a.get_high()));
1774 }
1775 
1776 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8uq(Vec8uq const & a)1777 static inline Vec8uq permute8uq(Vec8uq const & a) {
1778     return Vec8uq (permute8q<i0,i1,i2,i3,i4,i5,i6,i7> (a));
1779 }
1780 
1781 
1782 // Permute vector of 16 32-bit integers.
1783 // Index -1 gives 0, index -256 means don't care.
1784 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
permute16i(Vec16i const & a)1785 static inline Vec16i permute16i(Vec16i const & a) {
1786     return Vec16i(blend8i<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (a.get_low(), a.get_high()),
1787                   blend8i<i8,i9,i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high()));
1788 }
1789 
1790 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
permute16ui(Vec16ui const & a)1791 static inline Vec16ui permute16ui(Vec16ui const & a) {
1792     return Vec16ui (permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
1793 }
1794 
1795 
1796 /*****************************************************************************
1797 *
1798 *          Vector blend functions
1799 *
1800 ******************************************************************************
1801 *
1802 * These blend functions can mix elements from two different vectors and
1803 * optionally set some elements to zero.
1804 *
1805 * The indexes are inserted as template parameters in <>. These indexes must be
1806 * constants. Each template parameter is an index to the element you want to
1807 * select, where higher indexes indicate an element from the second source
1808 * vector. For example, if each vector has 8 elements, then indexes 0 - 7
1809 * will select an element from the first vector and indexes 8 - 15 will select
1810 * an element from the second vector. A negative index will generate zero.
1811 *
1812 * Example:
1813 * Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
1814 * Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
1815 * Vec8q c;
1816 * c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
1817 *
1818 * A lot of the code here is metaprogramming aiming to find the instructions
1819 * that best fit the template parameters and instruction set. The metacode
1820 * will be reduced out to leave only a few vector instructions in release
1821 * mode with optimization on.
1822 *****************************************************************************/
1823 
1824 
1825 // helper function used below
1826 template <int n>
select4(Vec8q const & a,Vec8q const & b)1827 static inline Vec4q select4(Vec8q const & a, Vec8q const & b) {
1828     switch (n) {
1829     case 0:
1830         return a.get_low();
1831     case 1:
1832         return a.get_high();
1833     case 2:
1834         return b.get_low();
1835     case 3:
1836         return b.get_high();
1837     }
1838     return Vec4q(0);
1839 }
1840 
1841 // blend vectors Vec8q
1842 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8q(Vec8q const & a,Vec8q const & b)1843 static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {
1844     const int j0 = i0 >= 0 ? i0/4 : i0;
1845     const int j1 = i1 >= 0 ? i1/4 : i1;
1846     const int j2 = i2 >= 0 ? i2/4 : i2;
1847     const int j3 = i3 >= 0 ? i3/4 : i3;
1848     const int j4 = i4 >= 0 ? i4/4 : i4;
1849     const int j5 = i5 >= 0 ? i5/4 : i5;
1850     const int j6 = i6 >= 0 ? i6/4 : i6;
1851     const int j7 = i7 >= 0 ? i7/4 : i7;
1852     Vec4q x0, x1;
1853 
1854     const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
1855     const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
1856     const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
1857     const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
1858 
1859     // Combine all the indexes into a single bitfield, with 4 bits for each
1860     const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
1861 
1862     // Mask to zero out negative indexes
1863     const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
1864 
1865     if (r0 < 0) {
1866         x0 =  Vec4q(0);
1867     }
1868     else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) {
1869         // i0 - i3 all from same source
1870         x0 = permute4q<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
1871     }
1872     else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) {
1873         // i0 - i3 all from two sources
1874         const int k0 =  i0 >= 0 ? i0 & 3 : i0;
1875         const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
1876         const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
1877         const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
1878         x0 = blend4q<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
1879     }
1880     else {
1881         // i0 - i3 from three or four different sources
1882         x0 = blend4q<0,1,6,7> (
1883              blend4q<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
1884              blend4q<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
1885     }
1886 
1887     if (r1 < 0) {
1888         x1 =  Vec4q(0);
1889     }
1890     else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) {
1891         // i4 - i7 all from same source
1892         x1 = permute4q<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
1893     }
1894     else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) {
1895         // i4 - i7 all from two sources
1896         const int k4 =  i4 >= 0 ? i4 & 3 : i4;
1897         const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
1898         const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
1899         const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
1900         x1 = blend4q<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
1901     }
1902     else {
1903         // i4 - i7 from three or four different sources
1904         x1 = blend4q<0,1,6,7> (
1905              blend4q<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
1906              blend4q<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
1907     }
1908 
1909     return Vec8q(x0,x1);
1910 }
1911 
1912 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8uq(Vec8uq const & a,Vec8uq const & b)1913 static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
1914     return Vec8uq( blend8q<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));
1915 }
1916 
1917 
1918 // helper function used below
1919 template <int n>
select4(Vec16i const & a,Vec16i const & b)1920 static inline Vec8i select4(Vec16i const & a, Vec16i const & b) {
1921     switch (n) {
1922     case 0:
1923         return a.get_low();
1924     case 1:
1925         return a.get_high();
1926     case 2:
1927         return b.get_low();
1928     case 3:
1929         return b.get_high();
1930     }
1931     return  Vec8i(0);
1932 }
1933 
1934 template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
1935           int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 >
blend16i(Vec16i const & a,Vec16i const & b)1936 static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {
1937 
1938     const int j0  = i0  >= 0 ? i0 /8 : i0;
1939     const int j1  = i1  >= 0 ? i1 /8 : i1;
1940     const int j2  = i2  >= 0 ? i2 /8 : i2;
1941     const int j3  = i3  >= 0 ? i3 /8 : i3;
1942     const int j4  = i4  >= 0 ? i4 /8 : i4;
1943     const int j5  = i5  >= 0 ? i5 /8 : i5;
1944     const int j6  = i6  >= 0 ? i6 /8 : i6;
1945     const int j7  = i7  >= 0 ? i7 /8 : i7;
1946     const int j8  = i8  >= 0 ? i8 /8 : i8;
1947     const int j9  = i9  >= 0 ? i9 /8 : i9;
1948     const int j10 = i10 >= 0 ? i10/8 : i10;
1949     const int j11 = i11 >= 0 ? i11/8 : i11;
1950     const int j12 = i12 >= 0 ? i12/8 : i12;
1951     const int j13 = i13 >= 0 ? i13/8 : i13;
1952     const int j14 = i14 >= 0 ? i14/8 : i14;
1953     const int j15 = i15 >= 0 ? i15/8 : i15;
1954 
1955     Vec8i x0, x1;
1956 
1957     const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2  >= 0 ? j2  : j3  >= 0 ? j3  : j4  >= 0 ? j4  : j5  >= 0 ? j5  : j6  >= 0 ? j6  : j7;
1958     const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
1959     const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2  : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
1960     const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;
1961 
1962     if (r0 < 0) {
1963         x0 = Vec8i(0);
1964     }
1965     else if (r0 == s0) {
1966         // i0 - i7 all from same source
1967         x0 = permute8i<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));
1968     }
1969     else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3==s0) && (j4<0||j4==r0||j4==s0) && (j5<0||j5==r0||j5==s0) && (j6<0||j6==r0||j6==s0) && (j7<0||j7==r0||j7==s0)) {
1970         // i0 - i7 all from two sources
1971         const int k0 =  i0 >= 0 ? (i0 & 7) : i0;
1972         const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
1973         const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
1974         const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
1975         const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
1976         const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
1977         const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
1978         const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
1979         x0 = blend8i<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
1980     }
1981     else {
1982         // i0 - i7 from three or four different sources
1983         const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;
1984         const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
1985         const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
1986         const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
1987         const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
1988         const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
1989         const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
1990         const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
1991         x0 = blend8i<n0, n1, n2, n3, n4, n5, n6, n7> (
1992              blend8i< j0   & 2 ? -256 : i0 &15,  j1   & 2 ? -256 : i1 &15,  j2   & 2 ? -256 : i2 &15,  j3   & 2 ? -256 : i3 &15,  j4   & 2 ? -256 : i4 &15,  j5   & 2 ? -256 : i5 &15,  j6   & 2 ? -256 : i6 &15,  j7   & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
1993              blend8i<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
1994     }
1995 
1996     if (r1 < 0) {
1997         x1 = Vec8i(0);
1998     }
1999     else if (r1 == s1) {
2000         // i8 - i15 all from same source
2001         x1 = permute8i<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
2002     }
2003     else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) {
2004         // i8 - i15 all from two sources
2005         const int k8 =  i8 >= 0 ? (i8 & 7) : i8;
2006         const int k9 = (i9 >= 0 ? (i9 & 7) : i9 ) | (j9 == s1 ? 8 : 0);
2007         const int k10= (i10>= 0 ? (i10& 7) : i10) | (j10== s1 ? 8 : 0);
2008         const int k11= (i11>= 0 ? (i11& 7) : i11) | (j11== s1 ? 8 : 0);
2009         const int k12= (i12>= 0 ? (i12& 7) : i12) | (j12== s1 ? 8 : 0);
2010         const int k13= (i13>= 0 ? (i13& 7) : i13) | (j13== s1 ? 8 : 0);
2011         const int k14= (i14>= 0 ? (i14& 7) : i14) | (j14== s1 ? 8 : 0);
2012         const int k15= (i15>= 0 ? (i15& 7) : i15) | (j15== s1 ? 8 : 0);
2013         x1 = blend8i<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
2014     }
2015     else {
2016         // i8 - i15 from three or four different sources
2017         const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
2018         const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
2019         const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
2020         const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
2021         const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
2022         const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
2023         const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
2024         const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
2025         x1 = blend8i<n8, n9, n10, n11, n12, n13, n14, n15> (
2026              blend8i< j8   & 2 ? -256 : i8 &15,  j9   & 2 ? -256 : i9 &15,  j10   & 2 ? -256 : i10 &15,  j11   & 2 ? -256 : i11 &15,  j12   & 2 ? -256 : i12 &15,  j13   & 2 ? -256 : i13 &15,  j14   & 2 ? -256 : i14 &15,  j15   & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
2027              blend8i<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
2028     }
2029     return Vec16i(x0,x1);
2030 }
2031 
2032 template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
2033           int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 >
blend16ui(Vec16ui const & a,Vec16ui const & b)2034 static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
2035     return Vec16ui( blend16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (Vec16i(a),Vec16i(b)));
2036 }
2037 
2038 
2039 /*****************************************************************************
2040 *
2041 *          Vector lookup functions
2042 *
2043 ******************************************************************************
2044 *
2045 * These functions use vector elements as indexes into a table.
2046 * The table is given as one or more vectors or as an array.
2047 *
2048 * This can be used for several purposes:
2049 *  - table lookup
2050 *  - permute or blend with variable indexes
2051 *  - blend from more than two sources
2052 *  - gather non-contiguous data
2053 *
2054 * An index out of range may produce any value - the actual value produced is
2055 * implementation dependent and may be different for different instruction
2056 * sets. An index out of range does not produce an error message or exception.
2057 *
2058 * Example:
2059 * Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
2060 * Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
2061 * Vec8q c;
2062 * c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
2063 *
2064 *****************************************************************************/
2065 
lookup16(Vec16i const & index,Vec16i const & table)2066 static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
2067     int32_t tab[16];
2068     table.store(tab);
2069     Vec8i t0 = lookup<16>(index.get_low(), tab);
2070     Vec8i t1 = lookup<16>(index.get_high(), tab);
2071     return Vec16i(t0, t1);
2072 }
2073 
2074 template <int n>
lookup(Vec16i const & index,void const * table)2075 static inline Vec16i lookup(Vec16i const & index, void const * table) {
2076     if (n <=  0) return 0;
2077     if (n <=  8) {
2078         Vec8i table1 = Vec8i().load(table);
2079         return Vec16i(
2080             lookup8 (index.get_low(),  table1),
2081             lookup8 (index.get_high(), table1));
2082     }
2083     if (n <= 16) return lookup16(index, Vec16i().load(table));
2084     // n > 16. Limit index
2085     Vec16ui i1;
2086     if ((n & (n-1)) == 0) {
2087         // n is a power of 2, make index modulo n
2088         i1 = Vec16ui(index) & (n-1);
2089     }
2090     else {
2091         // n is not a power of 2, limit to n-1
2092         i1 = min(Vec16ui(index), n-1);
2093     }
2094     int32_t const * t = (int32_t const *)table;
2095     return Vec16i(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]],
2096         t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]);
2097 }
2098 
lookup8(Vec8q const & index,Vec8q const & table)2099 static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
2100     int64_t tab[8];
2101     table.store(tab);
2102     Vec4q t0 = lookup<8>(index.get_low(), tab);
2103     Vec4q t1 = lookup<8>(index.get_high(), tab);
2104     return Vec8q(t0, t1);
2105 }
2106 
2107 template <int n>
lookup(Vec8q const & index,void const * table)2108 static inline Vec8q lookup(Vec8q const & index, void const * table) {
2109     if (n <= 0) return 0;
2110     if (n <= 4) {
2111         Vec4q table1 = Vec4q().load(table);
2112         return Vec8q(
2113             lookup4 (index.get_low(),  table1),
2114             lookup4 (index.get_high(), table1));
2115     }
2116     if (n <= 8) {
2117         return lookup8(index, Vec8q().load(table));
2118     }
2119     // n > 8. Limit index
2120     Vec8uq i1;
2121     if ((n & (n-1)) == 0) {
2122         // n is a power of 2, make index modulo n
2123         i1 = Vec8uq(index) & (n-1);
2124     }
2125     else {
2126         // n is not a power of 2, limit to n-1
2127         i1 = min(Vec8uq(index), n-1);
2128     }
2129     int64_t const * t = (int64_t const *)table;
2130     return Vec8q(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]);
2131 }
2132 
2133 /*****************************************************************************
2134 *
2135 *          Vector scatter functions
2136 *
2137 ******************************************************************************
2138 *
2139 * These functions write the elements of a vector to arbitrary positions in an
2140 * array in memory. Each vector element is written to an array position
2141 * determined by an index. An element is not written if the corresponding
2142 * index is out of range.
2143 * The indexes can be specified as constant template parameters or as an
2144 * integer vector.
2145 *
* The scatter functions are useful if the data are distributed in a sparse
2147 * manner into the array. If the array is dense then it is more efficient
2148 * to permute the data into the right positions and then write the whole
2149 * permuted vector into the array.
2150 *
2151 * Example:
2152 * Vec8q a(10,11,12,13,14,15,16,17);
2153 * int64_t b[16] = {0};
2154 * scatter<0,2,14,10,1,-1,5,9>(a,b);
2155 * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
2156 *
2157 *****************************************************************************/
2158 
2159 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
2160     int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
scatter(Vec16i const & data,void * array)2161     static inline void scatter(Vec16i const & data, void * array) {
2162     int32_t* arr = (int32_t*)array;
2163     const int index[16] = {i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15};
2164     for (int i = 0; i < 16; i++) {
2165         if (index[i] >= 0) arr[index[i]] = data[i];
2166     }
2167 }
2168 
2169 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
scatter(Vec8q const & data,void * array)2170 static inline void scatter(Vec8q const & data, void * array) {
2171     int64_t* arr = (int64_t*)array;
2172     const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
2173     for (int i = 0; i < 8; i++) {
2174         if (index[i] >= 0) arr[index[i]] = data[i];
2175     }
2176 }
2177 
scatter(Vec16i const & index,uint32_t limit,Vec16i const & data,void * array)2178 static inline void scatter(Vec16i const & index, uint32_t limit, Vec16i const & data, void * array) {
2179     int32_t* arr = (int32_t*)array;
2180     for (int i = 0; i < 16; i++) {
2181         if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
2182     }
2183 }
2184 
scatter(Vec8q const & index,uint32_t limit,Vec8q const & data,void * array)2185 static inline void scatter(Vec8q const & index, uint32_t limit, Vec8q const & data, void * array) {
2186     int64_t* arr = (int64_t*)array;
2187     for (int i = 0; i < 8; i++) {
2188         if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i];
2189     }
2190 }
2191 
scatter(Vec8i const & index,uint32_t limit,Vec8q const & data,void * array)2192 static inline void scatter(Vec8i const & index, uint32_t limit, Vec8q const & data, void * array) {
2193     int64_t* arr = (int64_t*)array;
2194     for (int i = 0; i < 8; i++) {
2195         if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
2196     }
2197 }
2198 
2199 /*****************************************************************************
2200 *
2201 *          Gather functions with fixed indexes
2202 *
2203 *****************************************************************************/
2204 // Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
2205 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
2206 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
gather16i(void const * a)2207 static inline Vec16i gather16i(void const * a) {
2208     Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
2209     // find smallest and biggest index, using only compile-time constant expressions
2210     const int i01min   = i0  < i1  ? i0  : i1;
2211     const int i23min   = i2  < i3  ? i2  : i3;
2212     const int i45min   = i4  < i5  ? i4  : i5;
2213     const int i67min   = i6  < i7  ? i6  : i7;
2214     const int i89min   = i8  < i9  ? i8  : i9;
2215     const int i1011min = i10 < i11 ? i10 : i11;
2216     const int i1213min = i12 < i13 ? i12 : i13;
2217     const int i1415min = i14 < i15 ? i14 : i15;
2218     const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
2219     const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
2220     const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
2221     const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
2222     const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
2223     const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
2224     const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
2225     const int i01max   = i0  > i1  ? i0  : i1;
2226     const int i23max   = i2  > i3  ? i2  : i3;
2227     const int i45max   = i4  > i5  ? i4  : i5;
2228     const int i67max   = i6  > i7  ? i6  : i7;
2229     const int i89max   = i8  > i9  ? i8  : i9;
2230     const int i1011max = i10 > i11 ? i10 : i11;
2231     const int i1213max = i12 > i13 ? i12 : i13;
2232     const int i1415max = i14 > i15 ? i14 : i15;
2233     const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
2234     const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
2235     const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
2236     const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
2237     const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
2238     const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
2239     const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
2240     if (imax - imin <= 15) {
2241         // load one contiguous block and permute
2242         if (imax > 15) {
2243             // make sure we don't read past the end of the array
2244             Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
2245             return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
2246                 i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
2247         }
2248         else {
2249             Vec16i b = Vec16i().load((int32_t const *)a + imin);
2250             return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
2251                 i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
2252         }
2253     }
2254     if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
2255     &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)
2256     &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
2257     &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
2258         // load two contiguous blocks and blend
2259         Vec16i b = Vec16i().load((int32_t const *)a + imin);
2260         Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
2261         const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
2262         const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
2263         const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
2264         const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
2265         const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
2266         const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
2267         const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
2268         const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
2269         const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
2270         const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
2271         const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
2272         const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
2273         const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
2274         const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
2275         const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
2276         const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
2277         return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
2278     }
2279     // use lookup function
2280     return lookup<imax+1>(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), a);
2281 }
2282 
2283 
2284 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
gather8q(void const * a)2285 static inline Vec8q gather8q(void const * a) {
2286     Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative
2287 
2288     const int i01min = i0 < i1 ? i0 : i1;
2289     const int i23min = i2 < i3 ? i2 : i3;
2290     const int i45min = i4 < i5 ? i4 : i5;
2291     const int i67min = i6 < i7 ? i6 : i7;
2292     const int i0123min = i01min < i23min ? i01min : i23min;
2293     const int i4567min = i45min < i67min ? i45min : i67min;
2294     const int imin = i0123min < i4567min ? i0123min : i4567min;
2295     const int i01max = i0 > i1 ? i0 : i1;
2296     const int i23max = i2 > i3 ? i2 : i3;
2297     const int i45max = i4 > i5 ? i4 : i5;
2298     const int i67max = i6 > i7 ? i6 : i7;
2299     const int i0123max = i01max > i23max ? i01max : i23max;
2300     const int i4567max = i45max > i67max ? i45max : i67max;
2301     const int imax = i0123max > i4567max ? i0123max : i4567max;
2302     if (imax - imin <= 7) {
2303         // load one contiguous block and permute
2304         if (imax > 7) {
2305             // make sure we don't read past the end of the array
2306             Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
2307             return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
2308         }
2309         else {
2310             Vec8q b = Vec8q().load((int64_t const *)a + imin);
2311             return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
2312         }
2313     }
2314     if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
2315     &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
2316         // load two contiguous blocks and blend
2317         Vec8q b = Vec8q().load((int64_t const *)a + imin);
2318         Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
2319         const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
2320         const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
2321         const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
2322         const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
2323         const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
2324         const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
2325         const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
2326         const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
2327         return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
2328     }
2329     // use lookup function
2330     return lookup<imax+1>(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), a);
2331 }
2332 
2333 
2334 /*****************************************************************************
2335 *
2336 *          Functions for conversion between integer sizes
2337 *
2338 *****************************************************************************/
2339 
2340 // Extend 16-bit integers to 32-bit integers, signed and unsigned
2341 
2342 // Function extend_to_int : extends Vec16s to Vec16i with sign extension
extend_to_int(Vec16s const & a)2343 static inline Vec16i extend_to_int (Vec16s const & a) {
2344     return Vec16i(extend_low(a), extend_high(a));
2345 }
2346 
2347 // Function extend_to_int : extends Vec16us to Vec16ui with zero extension
extend_to_int(Vec16us const & a)2348 static inline Vec16ui extend_to_int (Vec16us const & a) {
2349     return Vec16i(extend_low(a), extend_high(a));
2350 }
2351 
2352 // Function extend_to_int : extends Vec16c to Vec16i with sign extension
extend_to_int(Vec16c const & a)2353 static inline Vec16i extend_to_int (Vec16c const & a) {
2354     return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
2355 }
2356 
2357 // Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
extend_to_int(Vec16uc const & a)2358 static inline Vec16ui extend_to_int (Vec16uc const & a) {
2359     return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
2360 }
2361 
2362 
2363 // Extend 32-bit integers to 64-bit integers, signed and unsigned
2364 
2365 // Function extend_low : extends the low 8 elements to 64 bits with sign extension
extend_low(Vec16i const & a)2366 static inline Vec8q extend_low (Vec16i const & a) {
2367     return Vec8q(extend_low(a.get_low()), extend_high(a.get_low()));
2368 }
2369 
2370 // Function extend_high : extends the high 8 elements to 64 bits with sign extension
extend_high(Vec16i const & a)2371 static inline Vec8q extend_high (Vec16i const & a) {
2372     return Vec8q(extend_low(a.get_high()), extend_high(a.get_high()));
2373 }
2374 
2375 // Function extend_low : extends the low 8 elements to 64 bits with zero extension
extend_low(Vec16ui const & a)2376 static inline Vec8uq extend_low (Vec16ui const & a) {
2377     return Vec8q(extend_low(a.get_low()), extend_high(a.get_low()));
2378 }
2379 
2380 // Function extend_high : extends the high 8 elements to 64 bits with zero extension
extend_high(Vec16ui const & a)2381 static inline Vec8uq extend_high (Vec16ui const & a) {
2382     return Vec8q(extend_low(a.get_high()), extend_high(a.get_high()));
2383 }
2384 
2385 
// Compress 32-bit integers to 8-bit or 16-bit integers, signed and unsigned, with and without saturation

// Function compress_to_int8 : packs one vector of 32-bit integers into one vector of 8-bit integers
// Overflow wraps around
compress_to_int8(Vec16i const & a)2390 static inline Vec16c compress_to_int8 (Vec16i const & a) {
2391     Vec16s b = compress(a.get_low(), a.get_high());
2392     Vec16c c = compress(b.get_low(), b.get_high());
2393     return c;
2394 }
2395 
compress_to_int16(Vec16i const & a)2396 static inline Vec16s compress_to_int16 (Vec16i const & a) {
2397     return compress(a.get_low(), a.get_high());
2398 }
2399 
2400 // with signed saturation
compress_to_int8_saturated(Vec16i const & a)2401 static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
2402     Vec16s b = compress_saturated(a.get_low(), a.get_high());
2403     Vec16c c = compress_saturated(b.get_low(), b.get_high());
2404     return c;
2405 }
2406 
compress_to_int16_saturated(Vec16i const & a)2407 static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
2408     return compress_saturated(a.get_low(), a.get_high());
2409 }
2410 
2411 // with unsigned saturation
compress_to_int8_saturated(Vec16ui const & a)2412 static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
2413     Vec16us b = compress_saturated(a.get_low(), a.get_high());
2414     Vec16uc c = compress_saturated(b.get_low(), b.get_high());
2415     return c;
2416 }
2417 
compress_to_int16_saturated(Vec16ui const & a)2418 static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
2419     return compress_saturated(a.get_low(), a.get_high());
2420 }
2421 
2422 // Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
2423 
2424 // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
2425 // Overflow wraps around
compress(Vec8q const & low,Vec8q const & high)2426 static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
2427     return Vec16i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
2428 }
2429 
2430 // Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
2431 // Signed, with saturation
compress_saturated(Vec8q const & low,Vec8q const & high)2432 static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
2433     return Vec16i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
2434 }
2435 
2436 // Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
2437 // Unsigned, with saturation
compress_saturated(Vec8uq const & low,Vec8uq const & high)2438 static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
2439     return Vec16ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
2440 }
2441 
2442 
2443 /*****************************************************************************
2444 *
2445 *          Integer division operators
2446 *
2447 *          Please see the file vectori128.h for explanation.
2448 *
2449 *****************************************************************************/
2450 
2451 // vector operator / : divide each element by divisor
2452 
2453 // vector operator / : divide all elements by same integer
2454 static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
2455     return Vec16i(a.get_low() / d, a.get_high() / d);
2456 }
2457 
2458 // vector operator /= : divide
2459 static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
2460     a = a / d;
2461     return a;
2462 }
2463 
2464 // vector operator / : divide all elements by same integer
2465 static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
2466     return Vec16ui(a.get_low() / d, a.get_high() / d);
2467 }
2468 
2469 // vector operator /= : divide
2470 static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
2471     a = a / d;
2472     return a;
2473 }
2474 
2475 
2476 /*****************************************************************************
2477 *
2478 *          Integer division 2: divisor is a compile-time constant
2479 *
2480 *****************************************************************************/
2481 
2482 // Divide Vec16i by compile-time constant
2483 template <int32_t d>
divide_by_i(Vec16i const & a)2484 static inline Vec16i divide_by_i(Vec16i const & a) {
2485     return Vec16i(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
2486 }
2487 
2488 // define Vec16i a / const_int(d)
2489 template <int32_t d>
2490 static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
2491     return divide_by_i<d>(a);
2492 }
2493 
2494 // define Vec16i a / const_uint(d)
2495 template <uint32_t d>
2496 static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
2497     Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
2498     return divide_by_i<int32_t(d)>(a);                               // signed divide
2499 }
2500 
2501 // vector operator /= : divide
2502 template <int32_t d>
2503 static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
2504     a = a / b;
2505     return a;
2506 }
2507 
2508 // vector operator /= : divide
2509 template <uint32_t d>
2510 static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
2511     a = a / b;
2512     return a;
2513 }
2514 
2515 // Divide Vec16ui by compile-time constant
2516 template <uint32_t d>
divide_by_ui(Vec16ui const & a)2517 static inline Vec16ui divide_by_ui(Vec16ui const & a) {
2518     return Vec16ui( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
2519 }
2520 
2521 // define Vec16ui a / const_uint(d)
2522 template <uint32_t d>
2523 static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
2524     return divide_by_ui<d>(a);
2525 }
2526 
2527 // define Vec16ui a / const_int(d)
2528 template <int32_t d>
2529 static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
2530     Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
2531     return divide_by_ui<d>(a);                                       // unsigned divide
2532 }
2533 
2534 // vector operator /= : divide
2535 template <uint32_t d>
2536 static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
2537     a = a / b;
2538     return a;
2539 }
2540 
2541 // vector operator /= : divide
2542 template <int32_t d>
2543 static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
2544     a = a / b;
2545     return a;
2546 }
2547 
2548 
2549 /*****************************************************************************
2550 *
2551 *          Horizontal scan functions
2552 *
2553 *****************************************************************************/
2554 
2555 // Get index to the first element that is true. Return -1 if all are false
horizontal_find_first(Vec16ib const & x)2556 static inline int horizontal_find_first(Vec16ib const & x) {
2557     int a1 = horizontal_find_first(x.get_low());
2558     if (a1 >= 0) return a1;
2559     int a2 = horizontal_find_first(x.get_high());
2560     if (a2 < 0) return a2;
2561     return a2 + 8;
2562 }
2563 
horizontal_find_first(Vec8qb const & x)2564 static inline int horizontal_find_first(Vec8qb const & x) {
2565     int a1 = horizontal_find_first(x.get_low());
2566     if (a1 >= 0) return a1;
2567     int a2 = horizontal_find_first(x.get_high());
2568     if (a2 < 0) return a2;
2569     return a2 + 4;
2570 }
2571 
2572 // count the number of true elements
horizontal_count(Vec16ib const & x)2573 static inline uint32_t horizontal_count(Vec16ib const & x) {
2574     return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
2575 }
2576 
horizontal_count(Vec8qb const & x)2577 static inline uint32_t horizontal_count(Vec8qb const & x) {
2578     return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
2579 }
2580 
2581 
2582 /*****************************************************************************
2583 *
2584 *          Boolean <-> bitfield conversion functions
2585 *
2586 *****************************************************************************/
2587 
2588 // to_bits: convert to integer bitfield
to_bits(Vec16b const & a)2589 static inline uint16_t to_bits(Vec16b const & a) {
2590     return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8);
2591 }
2592 
2593 // to_bits: convert to integer bitfield
to_bits(Vec16ib const & a)2594 static inline uint16_t to_bits(Vec16ib const & a) {
2595     return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8);
2596 }
2597 
2598 // to_Vec16ib: convert integer bitfield to boolean vector
to_Vec16ib(uint16_t const & x)2599 static inline Vec16ib to_Vec16ib(uint16_t const & x) {
2600     return Vec16i(to_Vec8ib(uint8_t(x)), to_Vec8ib(uint8_t(x>>8)));
2601 }
2602 
2603 // to_bits: convert to integer bitfield
to_bits(Vec8b const & a)2604 static inline uint8_t to_bits(Vec8b const & a) {
2605     return to_bits(a.get_low()) | (to_bits(a.get_high()) << 4);
2606 }
2607 
2608 // to_Vec8qb: convert integer bitfield to boolean vector
to_Vec8qb(uint8_t x)2609 static inline Vec8qb to_Vec8qb(uint8_t x) {
2610     return Vec8q(to_Vec4qb(x), to_Vec4qb(x>>4));
2611 }
2612 
2613 #ifdef VCL_NAMESPACE
2614 }
2615 #endif
2616 
2617 #endif // VECTORI512_H
2618