/****************************  vectori512.h   *******************************
* Author:        Agner Fog
* Date created:  2014-07-23
* Last modified: 2017-02-19
* Version:       1.27
* Project:       vector classes
* Description:
* Header file defining integer vector classes as interface to intrinsic
* functions in x86 microprocessors with AVX512 and later instruction sets.
*
* Instructions:
* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
* instruction set, which must be at least AVX512.
*
* The following vector classes are defined here:
* Vec16i    Vector of  16  32-bit signed   integers
* Vec16ui   Vector of  16  32-bit unsigned integers
* Vec16ib   Vector of  16  Booleans for use with Vec16i and Vec16ui
* Vec8q     Vector of   8  64-bit signed   integers
* Vec8uq    Vector of   8  64-bit unsigned integers
* Vec8qb    Vector of   8  Booleans for use with Vec8q and Vec8uq
*
* Each vector object is represented internally in the CPU as a 512-bit register.
* This header file defines operators and functions for these vectors.
*
* For detailed instructions, see VectorClass.pdf
*
* (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/

// check combination of header files
#if defined (VECTORI512_H)
#if    VECTORI512_H != 2
#error Two different versions of vectori512.h included
#endif
#else
#define VECTORI512_H  2

#ifdef VECTORF512_H
#error Please put header file vectori512.h before vectorf512.h
#endif


#if INSTRSET < 9   // AVX512 required
#error Wrong instruction set for vectori512.h, AVX512 required or use vectori512e.h
#endif

#include "vectori256.h"

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

// Bug fix for missing intrinsics:
// _mm512_cmpgt_epu32_mask, _mm512_cmpgt_epu64_mask
// all typecast intrinsics
// Fix expected in GCC version 4.9.3 or 5.0. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878

// questionable
// _mm512_mask_mov_epi32 check select(). Doc at https://software.intel.com/en-us/node/513888 is wrong. Bug report filed


#if defined (GCC_VERSION) && GCC_VERSION < 50000 && !defined(__INTEL_COMPILER) && !defined(__clang__)

static inline __m512i _mm512_castsi256_si512(__m256i x) {
    union {
        __m512i a;
        __m256i b;
    } u;
    u.b = x;
    return u.a;
}

static inline __m256i _mm512_castsi512_si256(__m512i x) {
    union {
        __m512i a;
        __m256i b;
    } u;
    u.a = x;
    return u.b;
}

static inline __m512i _mm512_castsi128_si512(__m128i x) {
    union {
        __m128i a;
        __m512i b;
    } u;
    u.a = x;
    return u.b;
}

static inline __m128i _mm512_castsi512_si128(__m512i x) {
    union {
        __m512i a;
        __m128i b;
    } u;
    u.a = x;
    return u.b;
}

#endif


/*****************************************************************************
*
*          Generate compile-time constant vector
*
*****************************************************************************/
// Generate a constant vector of 16 integers stored in memory.
// Can be converted to any integer vector type
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline __m512i constant16i() {
    static const union {
        int32_t i[16];
        __m512i zmm;
    } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};
    return u.zmm;
}
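
// Example (illustrative sketch only, not part of the library interface):
// build the index sequence 0..15 as a compile-time constant.
static inline __m512i example_index_sequence() {
    return constant16i<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>();
}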


/*****************************************************************************
*
*          Boolean vector base classes for AVX512
*
*****************************************************************************/

class Vec16b {
protected:
    __mmask16  m16; // Boolean vector
public:
    // Default constructor:
    Vec16b () {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec16b (__mmask16 x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7,
    bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7 |
              b8<<8 | b9<<9 | b10<<10 | b11<<11 | b12<<12 | b13<<13 | b14<<14 | b15<<15);
    }
    // Constructor to broadcast single value:
    Vec16b(bool b) {
        m16 = __mmask16(-int16_t(b));
    }
private: // Prevent constructing from int, etc.
    Vec16b(int b);
public:
    // Constructor to make from two halves
    Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
        // = Vec16i(x0,x1) != 0;  (not defined yet)
        __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
        m16 = _mm512_cmpneq_epi32_mask(z, _mm512_setzero_epi32());
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec16b & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Assignment operator to broadcast scalar value:
    Vec16b & operator = (bool b) {
        m16 = Vec16b(b);
        return *this;
    }
private: // Prevent assigning int because of ambiguity
    Vec16b & operator = (int x);
public:
    // Type cast operator to convert to __mmask16 used in intrinsics
    operator __mmask16() const {
        return m16;
    }
    // split into two halves
    Vec8ib get_low() const {
        return to_Vec8ib((uint8_t)m16);
    }
    Vec8ib get_high() const {
        return to_Vec8ib((uint16_t)m16 >> 8);
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec16b const & insert(uint32_t index, bool value) {
        m16 = __mmask16(((uint16_t)m16 & ~(1 << index)) | (int)value << index);
        return *this;
    }
    // Member function extract a single element from vector
    bool extract(uint32_t index) const {
        return ((uint32_t)m16 >> index) & 1;
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    static int size () {
        return 16;
    }
};

// Define operators for this class

// vector operator & : bitwise and
static inline Vec16b operator & (Vec16b a, Vec16b b) {
    return _mm512_kand(a, b);
}
static inline Vec16b operator && (Vec16b a, Vec16b b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec16b operator | (Vec16b a, Vec16b b) {
    return _mm512_kor(a, b);
}
static inline Vec16b operator || (Vec16b a, Vec16b b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec16b operator ^ (Vec16b a, Vec16b b) {
    return _mm512_kxor(a, b);
}

// vector operator ~ : bitwise not
static inline Vec16b operator ~ (Vec16b a) {
    return _mm512_knot(a);
}

// vector operator ! : element not
static inline Vec16b operator ! (Vec16b a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec16b & operator &= (Vec16b & a, Vec16b b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec16b & operator |= (Vec16b & a, Vec16b b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec16b & operator ^= (Vec16b & a, Vec16b b) {
    a = a ^ b;
    return a;
}


/*****************************************************************************
*
*          Functions for boolean vectors
*
*****************************************************************************/

// function andnot: a & ~ b
static inline Vec16b andnot (Vec16b a, Vec16b b) {
    return _mm512_kandn(b, a);
}

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec16b const & a) {
    return (uint16_t)(__mmask16)a == 0xFFFF;
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec16b const & a) {
    return (uint16_t)(__mmask16)a != 0;
}
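
// Example (illustrative sketch only, not part of the library interface):
// test whether two boolean vectors differ in any element position.
static inline bool example_masks_differ(Vec16b const & a, Vec16b const & b) {
    return horizontal_or(a ^ b);   // xor is true exactly where a and b differ
}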


/*****************************************************************************
*
*          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
*
*****************************************************************************/

class Vec16ib : public Vec16b {
public:
    // Default constructor:
    Vec16ib () {
    }
    Vec16ib (Vec16b x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
        Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec16ib (__mmask16 x) {
        m16 = x;
    }
    // Constructor to broadcast single value:
    Vec16ib(bool b) : Vec16b(b) {}
private: // Prevent constructing from int, etc.
    Vec16ib(int b);
public:
    // Constructor to make from two halves
    Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
        m16 = Vec16b(x0, x1);
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec16ib & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Assignment operator to broadcast scalar value:
    Vec16ib & operator = (bool b) {
        m16 = Vec16b(b);
        return *this;
    }
private: // Prevent assigning int because of ambiguity
    Vec16ib & operator = (int x);
public:
};

// Define operators for Vec16ib

// vector operator & : bitwise and
static inline Vec16ib operator & (Vec16ib a, Vec16ib b) {
    return Vec16b(a) & Vec16b(b);
}
static inline Vec16ib operator && (Vec16ib a, Vec16ib b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec16ib operator | (Vec16ib a, Vec16ib b) {
    return Vec16b(a) | Vec16b(b);
}
static inline Vec16ib operator || (Vec16ib a, Vec16ib b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec16ib operator ^ (Vec16ib a, Vec16ib b) {
    return Vec16b(a) ^ Vec16b(b);
}

// vector operator ~ : bitwise not
static inline Vec16ib operator ~ (Vec16ib a) {
    return ~Vec16b(a);
}

// vector operator ! : element not
static inline Vec16ib operator ! (Vec16ib a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib b) {
    a = a ^ b;
    return a;
}

// vector function andnot
static inline Vec16ib andnot (Vec16ib a, Vec16ib b) {
    return Vec16ib(andnot(Vec16b(a), Vec16b(b)));
}


/*****************************************************************************
*
*          Vec8b: Base class vector of 8 Booleans
*
*****************************************************************************/

class Vec8b : public Vec16b {
public:
    // Default constructor:
    Vec8b () {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec8b (__mmask16 x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec8b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7);
    }
    Vec8b (Vec16b const & x) {
        m16 = __mmask16(x);
    }
    // Constructor to broadcast single value:
    Vec8b(bool b) {
        m16 = __mmask16(-int8_t(b));
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec8b & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
private: // Prevent constructing from int etc. because of ambiguity
    Vec8b(int b);
    Vec8b & operator = (int x);
public:
    // split into two halves
    Vec4qb get_low() const {
        return Vec4qb(Vec4q(_mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(m16), -1LL))));
    }
    Vec4qb get_high() const {
        return Vec8b(__mmask16(m16 >> 4)).get_low();
    }
    static int size () {
        return 8;
    }
};

/*****************************************************************************
*
*          Functions for boolean vectors
*
*****************************************************************************/

// function andnot: a & ~ b
static inline Vec8b andnot (Vec8b a, Vec8b b) {
    return _mm512_kandn(b, a);
}

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec8b const & a) {
    return (uint8_t)(__mmask16)a == 0xFF;
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec8b const & a) {
    return (uint8_t)(__mmask16)a != 0;
}


/*****************************************************************************
*
*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
*
*****************************************************************************/

class Vec8qb : public Vec8b {
public:
    // Default constructor:
    Vec8qb () {
    }
    Vec8qb (Vec16b x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
        Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {
    }
    // Constructor to convert from type __mmask8 used in intrinsics:
    Vec8qb (__mmask8 x) {
        m16 = (__mmask16)x;
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec8qb (__mmask16 x) {
        m16 = x;
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec8qb & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Constructor to broadcast single value:
    Vec8qb(bool b) : Vec8b(b) {}
    // Assignment operator to broadcast scalar:
    Vec8qb & operator = (bool b) {
        m16 = Vec8b(b);
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec8qb(int b);
    Vec8qb & operator = (int x);
public:
    // Constructor to make from two halves
    Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
        // = Vec8q(x0,x1) != 0;  (not defined yet)
        __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
        m16 = _mm512_cmpneq_epi64_mask(z, _mm512_setzero_si512());
    }
};

// Define operators for Vec8qb

// vector operator & : bitwise and
static inline Vec8qb operator & (Vec8qb a, Vec8qb b) {
    return Vec16b(a) & Vec16b(b);
}
static inline Vec8qb operator && (Vec8qb a, Vec8qb b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec8qb operator | (Vec8qb a, Vec8qb b) {
    return Vec16b(a) | Vec16b(b);
}
static inline Vec8qb operator || (Vec8qb a, Vec8qb b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec8qb operator ^ (Vec8qb a, Vec8qb b) {
    return Vec16b(a) ^ Vec16b(b);
}

// vector operator ~ : bitwise not
static inline Vec8qb operator ~ (Vec8qb a) {
    return ~Vec16b(a);
}

// vector operator ! : element not
static inline Vec8qb operator ! (Vec8qb a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb b) {
    a = a ^ b;
    return a;
}

// to_bits: convert to integer bitfield
static inline uint32_t to_bits(Vec8qb a) {
    return (uint8_t)(__mmask16)a;
}
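
// Example (illustrative sketch only, not part of the library interface):
// count the true elements of an 8-element boolean vector via its bitfield form.
static inline int example_count_true(Vec8qb a) {
    uint32_t bits = to_bits(a);
    int n = 0;
    while (bits) {             // simple popcount over the 8 mask bits
        n += bits & 1;
        bits >>= 1;
    }
    return n;
}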

// vector function andnot
static inline Vec8qb andnot (Vec8qb a, Vec8qb b) {
    return Vec8qb(andnot(Vec16b(a), Vec16b(b)));
}


/*****************************************************************************
*
*          Vector of 512 1-bit unsigned integers (base class for Vec16i)
*
*****************************************************************************/
class Vec512b {
protected:
    __m512i zmm; // Integer vector
public:
    // Default constructor:
    Vec512b() {
    }
    // Constructor to build from two Vec256b:
    Vec512b(Vec256b const & a0, Vec256b const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec512b(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec512b & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    }
    // Member function to load from array (unaligned)
    Vec512b & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    // You may use load_a instead of load if you are certain that p points to an address
    // divisible by 64, but there is hardly any speed advantage of load_a on modern processors
    Vec512b & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Member function to store into array (unaligned)
    void store(void * p) const {
        _mm512_storeu_si512(p, zmm);
    }
    // Member function to store into array, aligned by 64
    // You may use store_a instead of store if you are certain that p points to an address
    // divisible by 64, but there is hardly any speed advantage of store_a on modern processors
    void store_a(void * p) const {
        _mm512_store_si512(p, zmm);
    }
    // Member function to change a single bit, mainly for test purposes
    // Note: This function is inefficient. Use load function if changing more than one bit
    Vec512b const & set_bit(uint32_t index, int value) {
        static uint64_t m[16] = {0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0};
        int wi = (index >> 6) & 7;               // qword index
        int bi = index & 0x3F;                   // bit index within qword

        __m512i mask = Vec512b().load(m+8-wi);   // 1 in qword number wi
        mask = _mm512_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number bi set
        if (value & 1) {
            zmm = _mm512_or_si512(mask,zmm);
        }
        else {
            zmm = _mm512_andnot_si512(mask,zmm);
        }
        return *this;
    }
    // Member function to get a single bit, mainly for test purposes
    // Note: This function is inefficient. Use store function if reading more than one bit
    int get_bit(uint32_t index) const {
        union {
            __m512i z;
            uint8_t i[64];
        } u;
        u.z = zmm;
        int wi = (index >> 3) & 0x3F;            // byte index
        int bi = index & 7;                      // bit index within byte
        return (u.i[wi] >> bi) & 1;
    }
    // Member functions to split into two Vec256b:
    Vec256b get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec256b get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 512;
    }
};
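
// Example (illustrative sketch only, not part of the library interface):
// toggle a single bit of a 512-bit vector in memory using load/get_bit/set_bit/store.
static inline void example_toggle_bit(void * p, uint32_t index) {
    Vec512b v;
    v.load(p);                            // unaligned load of 64 bytes
    v.set_bit(index, !v.get_bit(index));  // flip the selected bit
    v.store(p);
}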


// Define operators for this class

// vector operator & : bitwise and
static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
    return _mm512_and_epi32(a, b);
}
static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
    return _mm512_or_epi32(a, b);
}
static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
    return _mm512_xor_epi32(a, b);
}

// vector operator ~ : bitwise not
static inline Vec512b operator ~ (Vec512b const & a) {
    return _mm512_xor_epi32(a, _mm512_set1_epi32(-1));
}

// vector operator &= : bitwise and
static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
    a = a ^ b;
    return a;
}

// Define functions for this class

// function andnot: a & ~ b
static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
    return _mm512_andnot_epi32(b, a);
}


/*****************************************************************************
*
*          Vector of 16 32-bit signed integers
*
*****************************************************************************/

class Vec16i: public Vec512b {
public:
    // Default constructor:
    Vec16i() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec16i(int i) {
        zmm = _mm512_set1_epi32(i);
    }
    // Constructor to build from all elements:
    Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
        int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
    }
    // Constructor to build from two Vec8i:
    Vec16i(Vec8i const & a0, Vec8i const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec16i(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec16i & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    }
    // Member function to load from array (unaligned)
    Vec16i & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec16i & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Partial load. Load n elements and set the rest to 0
    Vec16i & load_partial(int n, void const * p) {
        zmm = _mm512_maskz_loadu_epi32(__mmask16((1 << n) - 1), p);
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, void * p) const {
        _mm512_mask_storeu_epi32(p, __mmask16((1 << n) - 1), zmm);
    }
    // cut off vector to n elements. The last 16-n elements are set to zero
    Vec16i & cutoff(int n) {
        zmm = _mm512_maskz_mov_epi32(__mmask16((1 << n) - 1), zmm);
        return *this;
    }
    // Member function to change a single element in vector
    Vec16i const & insert(uint32_t index, int32_t value) {
        zmm = _mm512_mask_set1_epi32(zmm, __mmask16(1 << index), value);
        return *this;
    }
    // Member function extract a single element from vector
    int32_t extract(uint32_t index) const {
        int32_t a[16];
        store(a);
        return a[index & 15];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    int32_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec8i:
    Vec8i get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec8i get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 16;
    }
};


// Define operators for Vec16i

// vector operator + : add element by element
static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
    return _mm512_add_epi32(a, b);
}

// vector operator += : add
static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec16i operator ++ (Vec16i & a, int) {
    Vec16i a0 = a;
    a = a + 1;
    return a0;
}

// prefix operator ++
static inline Vec16i & operator ++ (Vec16i & a) {
    a = a + 1;
    return a;
}

// vector operator - : subtract element by element
static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
    return _mm512_sub_epi32(a, b);
}

// vector operator - : unary minus
static inline Vec16i operator - (Vec16i const & a) {
    return _mm512_sub_epi32(_mm512_setzero_epi32(), a);
}

// vector operator -= : subtract
static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec16i operator -- (Vec16i & a, int) {
    Vec16i a0 = a;
    a = a - 1;
    return a0;
}

// prefix operator --
static inline Vec16i & operator -- (Vec16i & a) {
    a = a - 1;
    return a;
}

// vector operator * : multiply element by element
static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
    return _mm512_mullo_epi32(a, b);
}

// vector operator *= : multiply
static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
    a = a * b;
    return a;
}

// vector operator / : divide all elements by same integer
// See bottom of file


// vector operator << : shift left
static inline Vec16i operator << (Vec16i const & a, int32_t b) {
    return _mm512_sll_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator <<= : shift left
static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
    a = a << b;
    return a;
}

// vector operator >> : shift right arithmetic
static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
    return _mm512_sra_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator >>= : shift right arithmetic
static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
    a = a >> b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpeq_epi32_mask(a, b);
}

// vector operator != : returns true for elements for which a != b
static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpneq_epi32_mask(a, b);
}

// vector operator > : returns true for elements for which a > b
static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
    return  _mm512_cmpgt_epi32_mask(a, b);
}

// vector operator < : returns true for elements for which a < b
static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
    return b > a;
}

// vector operator >= : returns true for elements for which a >= b (signed)
static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpge_epi32_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (signed)
static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
    return _mm512_and_epi32(a, b);
}

// vector operator &= : bitwise and
static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
    return _mm512_or_epi32(a, b);
}

// vector operator |= : bitwise or
static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
    return _mm512_xor_epi32(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec16i operator ~ (Vec16i const & a) {
    return a ^ Vec16i(-1);
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
    return _mm512_mask_mov_epi32(b, s, a);  // conditional move may be optimized better by the compiler than blend
    // return _mm512_mask_blend_epi32(s, b, a);
}
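
// Example (illustrative sketch only, not part of the library interface):
// clamp negative elements to zero using a comparison mask and select.
static inline Vec16i example_clamp_negative(Vec16i const & x) {
    Vec16ib neg = x < Vec16i(0);         // true where the element is negative
    return select(neg, Vec16i(0), x);    // 0 where negative, x elsewhere
}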

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
    return _mm512_mask_add_epi32(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline int32_t horizontal_add (Vec16i const & a) {
#if defined(__INTEL_COMPILER)
    return _mm512_reduce_add_epi32(a);
#else
    return horizontal_add(a.get_low() + a.get_high());
#endif
}
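
// Example (illustrative sketch only, not part of the library interface):
// sum an int32_t array of arbitrary length, using load_partial for the tail.
static inline int32_t example_sum_array(int32_t const * p, int n) {
    Vec16i sum(0), v;
    while (n >= 16) {                // full 16-element blocks
        v.load(p);
        sum += v;
        p += 16;  n -= 16;
    }
    v.load_partial(n, p);            // remaining 0..15 elements, rest are 0
    sum += v;
    return horizontal_add(sum);      // wraps around on overflow
}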

// function add_saturated: add element by element, signed with saturation
// (is it faster to up-convert to 64 bit integers, and then downconvert the sum with saturation?)
static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
    __m512i sum    = _mm512_add_epi32(a, b);                  // a + b
    __m512i axb    = _mm512_xor_epi32(a, b);                  // check if a and b have different sign
    __m512i axs    = _mm512_xor_epi32(a, sum);                // check if a and sum have different sign
    __m512i ovf1   = _mm512_andnot_epi32(axb,axs);            // check if sum has wrong sign
    __m512i ovf2   = _mm512_srai_epi32(ovf1,31);              // -1 if overflow
    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
    __m512i asign  = _mm512_srli_epi32(a,31);                 // 1  if a < 0
    __m512i sat1   = _mm512_srli_epi32(ovf2,1);               // 7FFFFFFF if overflow
    __m512i sat2   = _mm512_add_epi32(sat1,asign);            // 7FFFFFFF if positive overflow 80000000 if negative overflow
    return _mm512_mask_blend_epi32(ovf3, sum, sat2);          // sum if not overflow, else sat2
}
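
// The overflow test above uses the fact that signed overflow can occur only
// when a and b have the same sign and the sum has the opposite sign, i.e. when
// the sign bit of (~(a ^ b) & (a ^ sum)) is set. A scalar sketch of the same
// logic (illustrative only, not part of the library interface):
static inline int32_t example_scalar_add_saturated(int32_t a, int32_t b) {
    int32_t sum = int32_t(uint32_t(a) + uint32_t(b));     // wrapping add
    if ((~(a ^ b) & (a ^ sum)) < 0) {                     // sign bit set -> overflow
        return a < 0 ? int32_t(0x80000000) : 0x7FFFFFFF;  // saturate by sign of a
    }
    return sum;
}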

// function sub_saturated: subtract element by element, signed with saturation
static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
    __m512i diff   = _mm512_sub_epi32(a, b);                  // a - b
    __m512i axb    = _mm512_xor_si512(a, b);                  // check if a and b have different sign
    __m512i axs    = _mm512_xor_si512(a, diff);               // check if a and diff have different sign
    __m512i ovf1   = _mm512_and_si512(axb,axs);               // check if diff has wrong sign
    __m512i ovf2   = _mm512_srai_epi32(ovf1,31);              // -1 if overflow
    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
    __m512i asign  = _mm512_srli_epi32(a,31);                 // 1  if a < 0
    __m512i sat1   = _mm512_srli_epi32(ovf2,1);               // 7FFFFFFF if overflow
    __m512i sat2   = _mm512_add_epi32(sat1,asign);            // 7FFFFFFF if positive overflow 80000000 if negative overflow
    return _mm512_mask_blend_epi32(ovf3, diff, sat2);         // diff if not overflow, else sat2
}

// function max: a > b ? a : b
static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
    return _mm512_max_epi32(a,b);
}

// function min: a < b ? a : b
static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
    return _mm512_min_epi32(a,b);
}

// function abs: a >= 0 ? a : -a
static inline Vec16i abs(Vec16i const & a) {
    return _mm512_abs_epi32(a);
}

// function abs_saturated: same as abs, saturate if overflow
static inline Vec16i abs_saturated(Vec16i const & a) {
    return _mm512_min_epu32(abs(a), Vec16i(0x7FFFFFFF));
}

// function rotate_left all elements
// Use negative count to rotate right
static inline Vec16i rotate_left(Vec16i const & a, int b) {
    return _mm512_rolv_epi32(a, Vec16i(b));
}


/*****************************************************************************
*
*          Vector of 16 32-bit unsigned integers
*
*****************************************************************************/


class Vec16ui : public Vec16i {
public:
    // Default constructor:
    Vec16ui() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec16ui(uint32_t i) {
        zmm = _mm512_set1_epi32(i);
    }
    // Constructor to build from all elements:
    Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
        uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
    }
    // Constructor to build from two Vec8ui:
    Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
        zmm = Vec16i(Vec8i(a0), Vec8i(a1));
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec16ui(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec16ui & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Member function to load from array (unaligned)
    Vec16ui & load(void const * p) {
        Vec16i::load(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec16ui & load_a(void const * p) {
        Vec16i::load_a(p);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec16ui const & insert(uint32_t index, uint32_t value) {
        Vec16i::insert(index, value);
        return *this;
    }
    // Member function extract a single element from vector
    uint32_t extract(uint32_t index) const {
        return Vec16i::extract(index);
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    uint32_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec8ui:
    Vec8ui get_low() const {
        return Vec8ui(Vec16i::get_low());
    }
    Vec8ui get_high() const {
        return Vec8ui(Vec16i::get_high());
    }
};

// Define operators for this class

// vector operator + : add
static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) + Vec16i(b));
}

// vector operator - : subtract
static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) - Vec16i(b));
}

// vector operator * : multiply
static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) * Vec16i(b));
}

// vector operator / : divide
// See bottom of file

// vector operator >> : shift right logical all elements
static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
    return _mm512_srl_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator >> : shift right logical all elements
static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
    return a >> (uint32_t)b;
}

// vector operator >>= : shift right logical
static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
    a = a >> b;
    return a;
}

// vector operator >>= : shift right logical
static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
    a = a >> uint32_t(b);
    return a;
}

// vector operator << : shift left all elements
static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
    return Vec16ui ((Vec16i)a << (int32_t)b);
}

// vector operator << : shift left all elements
static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
    return Vec16ui ((Vec16i)a << (int32_t)b);
}

// vector operator < : returns true for elements for which a < b (unsigned)
static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
    return _mm512_cmplt_epu32_mask(a, b);
}

// vector operator > : returns true for elements for which a > b (unsigned)
static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (unsigned)
static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
    return  _mm512_cmpge_epu32_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (unsigned)
static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) & Vec16i(b));
}

// vector operator | : bitwise or
static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) | Vec16i(b));
}

// vector operator ^ : bitwise xor
static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) ^ Vec16i(b));
}

// vector operator ~ : bitwise not
static inline Vec16ui operator ~ (Vec16ui const & a) {
    return Vec16ui( ~ Vec16i(a));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(select(s, Vec16i(a), Vec16i(b)));
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(if_add(f, Vec16i(a), Vec16i(b)));
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline uint32_t horizontal_add (Vec16ui const & a) {
    return horizontal_add((Vec16i)a);
}

// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file

// function add_saturated: add element by element, unsigned with saturation
static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
    Vec16ui sum      = a + b;
    Vec16ib overflow = sum < (a | b);                  // overflow if (a + b) < (a | b)
    return _mm512_mask_set1_epi32(sum, overflow, -1);  // 0xFFFFFFFF if overflow
}

// function sub_saturated: subtract element by element, unsigned with saturation
static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
    Vec16ui diff      = a - b;
    return _mm512_maskz_mov_epi32(diff <= a, diff);   // underflow if diff > a gives zero
}
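
// Example (illustrative sketch only, not part of the library interface):
// absolute difference of unsigned vectors. One of the two saturating
// differences is always zero, so their bitwise or is |a - b|.
static inline Vec16ui example_abs_diff(Vec16ui const & a, Vec16ui const & b) {
    return sub_saturated(a, b) | sub_saturated(b, a);
}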

// function max: a > b ? a : b
static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
    return _mm512_max_epu32(a,b);
}

// function min: a < b ? a : b
static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
    return _mm512_min_epu32(a,b);
}


/*****************************************************************************
*
*          Vector of 8 64-bit signed integers
*
*****************************************************************************/

class Vec8q : public Vec512b {
public:
    // Default constructor:
    Vec8q() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec8q(int64_t i) {
        zmm = _mm512_set1_epi64(i);
    }
    // Constructor to build from all elements:
    Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
        zmm = _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
    }
    // Constructor to build from two Vec4q:
    Vec8q(Vec4q const & a0, Vec4q const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec8q(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec8q & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    }
    // Member function to load from array (unaligned)
    Vec8q & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec8q & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Partial load. Load n elements and set the rest to 0
    Vec8q & load_partial(int n, void const * p) {
        zmm = _mm512_maskz_loadu_epi64(__mmask16((1 << n) - 1), p);
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, void * p) const {
        _mm512_mask_storeu_epi64(p, __mmask16((1 << n) - 1), zmm);
    }
    // cut off vector to n elements. The last 8-n elements are set to zero
    Vec8q & cutoff(int n) {
        zmm = _mm512_maskz_mov_epi64(__mmask16((1 << n) - 1), zmm);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec8q const & insert(uint32_t index, int64_t value) {
        zmm = _mm512_mask_set1_epi64(zmm, __mmask16(1 << index), value);
        // zmm = _mm512_mask_blend_epi64(__mmask16(1 << index), zmm, _mm512_set1_epi64(value));
        return *this;
    }
    // Member function extract a single element from vector
    int64_t extract(uint32_t index) const {
        int64_t a[8];
        store (a);
        return a[index & 7];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    int64_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec4q:
    Vec4q get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec4q get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 8;
    }
};


// Define operators for Vec8q

// vector operator + : add element by element
static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
    return _mm512_add_epi64(a, b);
}

// vector operator += : add
static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec8q operator ++ (Vec8q & a, int) {
    Vec8q a0 = a;
    a = a + 1;
    return a0;
}

// prefix operator ++
static inline Vec8q & operator ++ (Vec8q & a) {
    a = a + 1;
    return a;
}

// vector operator - : subtract element by element
static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
    return _mm512_sub_epi64(a, b);
}

// vector operator - : unary minus
static inline Vec8q operator - (Vec8q const & a) {
    return _mm512_sub_epi64(_mm512_setzero_epi32(), a);
}

// vector operator -= : subtract
static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec8q operator -- (Vec8q & a, int) {
    Vec8q a0 = a;
    a = a - 1;
    return a0;
}

// prefix operator --
static inline Vec8q & operator -- (Vec8q & a) {
    a = a - 1;
    return a;
}

// vector operator * : multiply element by element
static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
#ifdef __AVX512DQ__
    return _mm512_mullo_epi64(a, b);
#elif defined (__INTEL_COMPILER)
    return _mm512_mullox_epi64(a, b);                      // _mm512_mullox_epi64 missing in gcc
#else
    // instruction does not exist. Split into 32-bit multiplies
    //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1);       // swap H<->L
    __m512i ahigh   = _mm512_srli_epi64(a, 32);            // high 32 bits of each a
    __m512i bhigh   = _mm512_srli_epi64(b, 32);            // high 32 bits of each b
    __m512i prodahb = _mm512_mul_epu32(ahigh, b);          // ahigh*b
    __m512i prodbha = _mm512_mul_epu32(bhigh, a);          // bhigh*a
    __m512i prodhl  = _mm512_add_epi64(prodahb, prodbha);  // sum of high*low products
    __m512i prodhi  = _mm512_slli_epi64(prodhl, 32);       // same, shifted high
    __m512i prodll  = _mm512_mul_epu32(a, b);              // alow*blow = 64 bit unsigned products
    __m512i prod    = _mm512_add_epi64(prodll, prodhi);    // low*low+(high*low)<<32
    return  prod;
#endif
}
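
// The fallback above relies on this identity (aH, aL = high and low 32 bits
// of a, likewise for b):
//   a * b mod 2^64 = aL*bL + ((aH*bL + bH*aL) << 32)
// The aH*bH term is shifted left by 64 bits and therefore vanishes modulo 2^64.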

// vector operator *= : multiply
static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
    a = a * b;
    return a;
}

// vector operator << : shift left
static inline Vec8q operator << (Vec8q const & a, int32_t b) {
    return _mm512_sll_epi64(a, _mm_cvtsi32_si128(b));
}

// vector operator <<= : shift left
static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
    a = a << b;
    return a;
}

// vector operator >> : shift right arithmetic
static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
    return _mm512_sra_epi64(a, _mm_cvtsi32_si128(b));
}

// vector operator >>= : shift right arithmetic
static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
    a = a >> b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpeq_epi64_mask(a, b));
}

// vector operator != : returns true for elements for which a != b
static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpneq_epi64_mask(a, b));
}

// vector operator < : returns true for elements for which a < b
static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmplt_epi64_mask(a, b));
}

// vector operator > : returns true for elements for which a > b
static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (signed)
static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpge_epi64_mask(a, b));
}

// vector operator <= : returns true for elements for which a <= b (signed)
static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
    return _mm512_and_epi32(a, b);
}

// vector operator &= : bitwise and
static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
    return _mm512_or_epi32(a, b);
}

// vector operator |= : bitwise or
static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
    return _mm512_xor_epi32(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec8q operator ~ (Vec8q const & a) {
    return Vec8q(~ Vec16i(a));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
    return _mm512_mask_mov_epi64(b, s, a);
    //return _mm512_mask_blend_epi64(s, b, a);
}
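
// Example (illustrative):
// Vec8qb s(true,false,true,false,true,false,true,false);
// Vec8q  a(1,2,3,4,5,6,7,8), b(10,20,30,40,50,60,70,80);
// Vec8q  c = select(s, a, b);               // c is (1,20,3,40,5,60,7,80)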

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
    return _mm512_mask_add_epi64(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline int64_t horizontal_add (Vec8q const & a) {
#if defined(__INTEL_COMPILER)
    return _mm512_reduce_add_epi64(a);
#else
    return horizontal_add(a.get_low()+a.get_high());
#endif
}
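
// Example: horizontal_add(Vec8q(1,2,3,4,5,6,7,8)) returns 36.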

// Horizontal add extended: Calculates the sum of all vector elements
// Elements are sign extended before adding to avoid overflow
static inline int64_t horizontal_add_x (Vec16i const & x) {
    Vec8q a = _mm512_cvtepi32_epi64(x.get_low());
    Vec8q b = _mm512_cvtepi32_epi64(x.get_high());
    return horizontal_add(a+b);
}

// Horizontal add extended: Calculates the sum of all vector elements
// Elements are zero extended before adding to avoid overflow
static inline uint64_t horizontal_add_x (Vec16ui const & x) {
    Vec8q a = _mm512_cvtepu32_epi64(x.get_low());
    Vec8q b = _mm512_cvtepu32_epi64(x.get_high());
    return horizontal_add(a+b);
}

// function max: a > b ? a : b
static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
    return _mm512_max_epi64(a, b);
}

// function min: a < b ? a : b
static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
    return _mm512_min_epi64(a, b);
}

// function abs: a >= 0 ? a : -a
static inline Vec8q abs(Vec8q const & a) {
    return _mm512_abs_epi64(a);
}

// function abs_saturated: same as abs, saturate if overflow
static inline Vec8q abs_saturated(Vec8q const & a) {
    return _mm512_min_epu64(abs(a), Vec8q(0x7FFFFFFFFFFFFFFF));
}

// function rotate_left all elements
// Use negative count to rotate right
static inline Vec8q rotate_left(Vec8q const & a, int b) {
    return _mm512_rolv_epi64(a, Vec8q(b));
}
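
// Example: rotate_left(Vec8q(1), 4) gives 16 in every element;
// rotate_left(Vec8q(1), -1) rotates right instead, giving 0x8000000000000000.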


/*****************************************************************************
*
*          Vector of 8 64-bit unsigned integers
*
*****************************************************************************/

class Vec8uq : public Vec8q {
public:
    // Default constructor:
    Vec8uq() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec8uq(uint64_t i) {
        zmm = Vec8q(i);
    }
    // Constructor to convert from Vec8q:
    Vec8uq(Vec8q const & x) {
        zmm = x;
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec8uq(__m512i const & x) {
        zmm = x;
    }
    // Constructor to build from all elements:
    Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
        zmm = Vec8q(i0, i1, i2, i3, i4, i5, i6, i7);
    }
    // Constructor to build from two Vec4uq:
    Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
        zmm = Vec8q(Vec4q(a0), Vec4q(a1));
    }
    // Assignment operator to convert from Vec8q:
    Vec8uq  & operator = (Vec8q const & x) {
        zmm = x;
        return *this;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec8uq & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Member function to load from array (unaligned)
    Vec8uq & load(void const * p) {
        Vec8q::load(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec8uq & load_a(void const * p) {
        Vec8q::load_a(p);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec8uq const & insert(uint32_t index, uint64_t value) {
        Vec8q::insert(index, value);
        return *this;
    }
    // Member function to extract a single element from vector
    uint64_t extract(uint32_t index) const {
        return Vec8q::extract(index);
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    uint64_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec4uq:
    Vec4uq get_low() const {
        return Vec4uq(Vec8q::get_low());
    }
    Vec4uq get_high() const {
        return Vec4uq(Vec8q::get_high());
    }
};

// Define operators for this class

// vector operator + : add
static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) + Vec8q(b));
}

// vector operator - : subtract
static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) - Vec8q(b));
}

// vector operator * : multiply element by element
static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) * Vec8q(b));
}

// vector operator >> : shift right logical all elements
static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
    return _mm512_srl_epi64(a,_mm_cvtsi32_si128(b));
}

// vector operator >> : shift right logical all elements
static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
    return a >> (uint32_t)b;
}

// vector operator >>= : shift right logical
static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
    a = a >> b;
    return a;
}

// vector operator >>= : shift right logical
static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
    a = a >> uint32_t(b);
    return a;
}

// vector operator << : shift left all elements
static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
    return Vec8uq ((Vec8q)a << (int32_t)b);
}

// vector operator << : shift left all elements
static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
    return Vec8uq ((Vec8q)a << b);
}

// vector operator < : returns true for elements for which a < b (unsigned)
static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
    return _mm512_cmplt_epu64_mask(a, b);
}

// vector operator > : returns true for elements for which a > b (unsigned)
static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (unsigned)
static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
    return _mm512_cmpge_epu64_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (unsigned)
static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
    return b >= a;
}
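
// Example: the same bit pattern compares differently when signed and unsigned:
// (Vec8q(-1)  < Vec8q(1))  is true in all elements  (signed:   -1 < 1)
// (Vec8uq(-1) < Vec8uq(1)) is false in all elements (unsigned: 0xFFFFFFFFFFFFFFFF > 1)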

// vector operator & : bitwise and
static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) & Vec8q(b));
}

// vector operator | : bitwise or
static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) | Vec8q(b));
}

// vector operator ^ : bitwise xor
static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) ^ Vec8q(b));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(select(s, Vec8q(a), Vec8q(b)));
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
    return _mm512_mask_add_epi64(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline uint64_t horizontal_add (Vec8uq const & a) {
    return horizontal_add(Vec8q(a));
}

// function max: a > b ? a : b
static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
    return _mm512_max_epu64(a, b);
}

// function min: a < b ? a : b
static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
    return _mm512_min_epu64(a, b);
}


/*****************************************************************************
*
*          Vector permute functions
*
******************************************************************************
*
* These permute functions can reorder the elements of a vector and optionally
* set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to select.
* An index of -1 will generate zero. An index of -256 means don't care.
*
* Example:
* Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
* Vec8q b;
* b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
*
* A lot of the code here is metaprogramming aiming to find the instructions
* that best fit the template parameters and instruction set. The metacode
* will be reduced out to leave only a few vector instructions in release
* mode with optimization on.
*****************************************************************************/

// Permute vector of 8 64-bit integers.
// Index -1 gives 0, index -256 means don't care.
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q permute8q(Vec8q const & a) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<< 8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;

    // Mask to zero out negative indexes
    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
    const int m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;

    // special case: all zero
    if (mz == 0) return  _mm512_setzero_epi32();

    // mask for elements not zeroed
    const __mmask16  z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
    // same with 2 bits for each element
    const __mmask16 zz = __mmask16((i0>=0?3:0) | (i1>=0?0xC:0) | (i2>=0?0x30:0) | (i3>=0?0xC0:0) | (i4>=0?0x300:0) | (i5>=0?0xC00:0) | (i6>=0?0x3000:0) | (i7>=0?0xC000:0));

    if (((m1 ^ 0x76543210) & mz) == 0) {
        // no shuffling
        if (dozero) {
            // zero some elements
            return _mm512_maskz_mov_epi64(z, a);
        }
        return a;                                 // do nothing
    }

    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
        // no exchange of data between the four 128-bit lanes
        const int pat = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;
        const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);
        if (((m1 ^ pat) & mz & 0x11111111) == 0) {
            // same permute pattern in all lanes
            if (dozero) {  // permute within lanes and zero
                return _mm512_maskz_shuffle_epi32(zz, a, (_MM_PERM_ENUM)pmask);
            }
            else {  // permute within lanes
                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
            }
        }
        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
    }
    if ((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0)
    &&  ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0) {
        // permute lanes only. no permutation within each lane
        const int m3 = m2 | (m2 >> 4);
        const int s = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);
        if (dozero) {
            // permute lanes and zero some 64-bit elements
            return  _mm512_maskz_shuffle_i64x2(z, a, a, (_MM_PERM_ENUM)s);
        }
        else {
            // permute lanes
            return _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)s);
        }
    }
    // full permute needed
    const __m512i pmask = constant16i<i0&7, 0, i1&7, 0, i2&7, 0, i3&7, 0, i4&7, 0, i5&7, 0, i6&7, 0, i7&7, 0>();
    if (dozero) {
        // full permute and zeroing
        // Note: the operand order is (mask, index, source), as given in the Intel intrinsics guide
        return _mm512_maskz_permutexvar_epi64(z, pmask, a);
    }
    else {
        return _mm512_permutexvar_epi64(pmask, a);
    }
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8uq permute8uq(Vec8uq const & a) {
    return Vec8uq (permute8q<i0,i1,i2,i3,i4,i5,i6,i7> (a));
}


// Permute vector of 16 32-bit integers.
// Index -1 gives 0, index -256 means don't care.
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16i permute16i(Vec16i const & a) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const uint64_t m1 = (i0&15) | (i1&15)<<4 | (i2&15)<< 8 | (i3&15)<<12 | (i4&15)<<16 | (i5&15)<<20 | (i6&15)<<24 | (i7&15LL)<<28   // 15LL avoids sign extension of (int32_t | int64_t)
        | (i8&15LL)<<32 | (i9&15LL)<<36 | (i10&15LL)<<40 | (i11&15LL)<<44 | (i12&15LL)<<48 | (i13&15LL)<<52 | (i14&15LL)<<56 | (i15&15LL)<<60;

    // Mask to zero out negative indexes
    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL) | (i8<0?0:0xF00000000)
        | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);

    const uint64_t m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;

    // special case: all zero
    if (mz == 0) return  _mm512_setzero_epi32();

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);

    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
        // no shuffling
        if (dozero) {
            // zero some elements
            return _mm512_maskz_mov_epi32(z, a);
        }
        return a;                                 // do nothing
    }

    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
        // no exchange of data between the four 128-bit lanes
        const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;
        const int pmask = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);
        if (((m1 ^ pat) & 0x3333333333333333 & mz) == 0) {
            // same permute pattern in all lanes
            if (dozero) {  // permute within lanes and zero
                return _mm512_maskz_shuffle_epi32(z, a, (_MM_PERM_ENUM)pmask);
            }
            else {  // permute within lanes
                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
            }
        }
        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
    }
    const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;
    if ((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0)
    &&  ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
        // permute lanes only. no permutation within each lane
        const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
        if (dozero) {
            // permute lanes and zero some 32-bit elements
            return  _mm512_maskz_shuffle_i32x4(z, a, a, (_MM_PERM_ENUM)s);
        }
        else {
            // permute lanes
            return _mm512_shuffle_i32x4(a, a, (_MM_PERM_ENUM)s);
        }
    }
    // full permute needed
    const __m512i pmask = constant16i<i0&15, i1&15, i2&15, i3&15, i4&15, i5&15, i6&15, i7&15, i8&15, i9&15, i10&15, i11&15, i12&15, i13&15, i14&15, i15&15>();
    if (dozero) {
        // full permute and zeroing
        return _mm512_maskz_permutexvar_epi32(z, pmask, a);
    }
    else {
        return _mm512_permutexvar_epi32(pmask, a);
    }
}


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16ui permute16ui(Vec16ui const & a) {
    return Vec16ui (permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
}


/*****************************************************************************
*
*          Vector blend functions
*
******************************************************************************
*
* These blend functions can mix elements from two different vectors and
* optionally set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to
* select, where higher indexes indicate an element from the second source
* vector. For example, if each vector has 8 elements, then indexes 0 - 7
* will select an element from the first vector and indexes 8 - 15 will select
* an element from the second vector. A negative index will generate zero.
*
* Example:
* Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
* Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
* Vec8q c;
* c = blend8q<1,0,9,8,7,-1,15,15> (a,b);    // c is (101, 100, 201, 200, 107,   0, 207, 207)
*
* A lot of the code here is metaprogramming aiming to find the instructions
* that best fit the template parameters and instruction set. The metacode
* will be reduced out to leave only a few vector instructions in release
* mode with optimization on.
*****************************************************************************/


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<< 8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;

    // Mask to zero out negative indexes
    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
    const int m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);

    // special case: all zero
    if (mz == 0) return  _mm512_setzero_epi32();

    // special case: all from a
    if ((m1 & 0x88888888 & mz) == 0) {
        return permute8q <i0, i1, i2, i3, i4, i5, i6, i7> (a);
    }

    // special case: all from b
    if ((~m1 & 0x88888888 & mz) == 0) {
        return permute8q <i0^8, i1^8, i2^8, i3^8, i4^8, i5^8, i6^8, i7^8> (b);
    }

    // special case: blend without permute
    if (((m1 ^ 0x76543210) & 0x77777777 & mz) == 0) {
        __mmask16 blendmask = __mmask16((i0&8)>>3 | (i1&8)>>2 | (i2&8)>>1 | (i3&8)>>0 | (i4&8)<<1 | (i5&8)<<2 | (i6&8)<<3 | (i7&8)<<4 );
        __m512i t = _mm512_mask_blend_epi64(blendmask, a, b);
        if (dozero) {
            t = _mm512_maskz_mov_epi64(z, t);
        }
        return t;
    }
    // special case: all data stay within their lane
    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
        // mask for elements from a and b
        const uint32_t mb = ((i0&8)?0xF:0) | ((i1&8)?0xF0:0) | ((i2&8)?0xF00:0) | ((i3&8)?0xF000:0) | ((i4&8)?0xF0000:0) | ((i5&8)?0xF00000:0) | ((i6&8)?0xF000000:0) | ((i7&8)?0xF0000000:0);
        const uint32_t mbz = mb & mz;     // mask for nonzero elements from b
        const uint32_t maz = ~mb & mz;    // mask for nonzero elements from a
        const uint32_t m1a = m1 & maz;
        const uint32_t m1b = m1 & mbz;
        const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
        const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
        if (((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0) {
            // Same permute pattern in all lanes:
            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
            // and we are saving 64 bytes of data cache.
            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
            __m512i ta = permute8q< (maz&0xF)?i0&7:-1, (maz&0xF0)?i1&7:-1, (maz&0xF00)?i2&7:-1, (maz&0xF000)?i3&7:-1,
                (maz&0xF0000)?i4&7:-1, (maz&0xF00000)?i5&7:-1, (maz&0xF000000)?i6&7:-1, (maz&0xF0000000)?i7&7:-1> (a);
            // write mask for elements from b
            const __mmask16 sb = ((mbz&0xF)?3:0) | ((mbz&0xF0)?0xC:0) | ((mbz&0xF00)?0x30:0) | ((mbz&0xF000)?0xC0:0) | ((mbz&0xF0000)?0x300:0) | ((mbz&0xF00000)?0xC00:0) | ((mbz&0xF000000)?0x3000:0) | ((mbz&0xF0000000)?0xC000:0);
            // permute index for elements from b
            const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
            // 2. Permute elements from b and combine with elements from a through write mask
            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
        }
        // not same permute pattern in all lanes. use full permute
    }
    // general case: full permute
    const __m512i pmask = constant16i<i0&0xF, 0, i1&0xF, 0, i2&0xF, 0, i3&0xF, 0, i4&0xF, 0, i5&0xF, 0, i6&0xF, 0, i7&0xF, 0>();
    if (dozero) {
        return _mm512_maskz_permutex2var_epi64(z, a, pmask, b);
    }
    else {
        return _mm512_permutex2var_epi64(a, pmask, b);
    }
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq( blend8q<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));
}


template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 >
static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {

    // Combine indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
    const uint64_t m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xFLL)<<28
        | (i8&0xFLL)<<32 | (i9&0xFLL)<<36 | (i10&0xFLL)<<40 | (i11&0xFLL)<<44 | (i12&0xFLL)<<48 | (i13&0xFLL)<<52 | (i14&0xFLL)<<56 | (i15&0xFLL)<<60;

    // Mask to zero out negative indexes
    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL)
        | (i8<0?0:0xF00000000) | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
    const uint64_t m2 = m1 & mz;

    // collect bit 4 of each index = select source
    const uint64_t ms = ((i0&16)?0xF:0) | ((i1&16)?0xF0:0) | ((i2&16)?0xF00:0) | ((i3&16)?0xF000:0) | ((i4&16)?0xF0000:0) | ((i5&16)?0xF00000:0) | ((i6&16)?0xF000000:0) | ((i7&16)?0xF0000000ULL:0)
        | ((i8&16)?0xF00000000:0) | ((i9&16)?0xF000000000:0) | ((i10&16)?0xF0000000000:0) | ((i11&16)?0xF00000000000:0) | ((i12&16)?0xF000000000000:0) | ((i13&16)?0xF0000000000000:0) | ((i14&16)?0xF00000000000000:0) | ((i15&16)?0xF000000000000000:0);

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);

    // special case: all zero
    if (mz == 0) return  _mm512_setzero_epi32();

    // special case: all from a
    if ((ms & mz) == 0) {
        return permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
    }

    // special case: all from b
    if ((~ms & mz) == 0) {
        return permute16i<i0^16,i1^16,i2^16,i3^16,i4^16,i5^16,i6^16,i7^16,i8^16,i9^16,i10^16,i11^16,i12^16,i13^16,i14^16,i15^16 > (b);
    }

    // special case: blend without permute
    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
        __mmask16 blendmask = __mmask16((i0&16)>>4 | (i1&16)>>3 | (i2&16)>>2 | (i3&16)>>1 | (i4&16) | (i5&16)<<1 | (i6&16)<<2 | (i7&16)<<3
            | (i8&16)<<4 | (i9&16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11);
        __m512i t = _mm512_mask_blend_epi32(blendmask, a, b);
        if (dozero) {
            t = _mm512_maskz_mov_epi32(z, t);
        }
        return t;
    }

    // special case: all data stay within their lane
    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {

        // mask for elements from a and b
        const uint64_t mb  = ms;
        const uint64_t mbz = mb & mz;     // mask for nonzero elements from b
        const uint64_t maz = ~mb & mz;    // mask for nonzero elements from a
        const uint64_t m1a = m1 & maz;
        const uint64_t m1b = m1 & mbz;
        const uint64_t pata = ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
        const uint64_t patb = ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
        if (((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0) {
            // Same permute pattern in all lanes:
            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
            // and we are saving 64 bytes of data cache.
            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
            __m512i ta = permute16i< (maz&0xF)?i0&15:-1, (maz&0xF0)?i1&15:-1, (maz&0xF00)?i2&15:-1, (maz&0xF000)?i3&15:-1,
                (maz&0xF0000)?i4&15:-1, (maz&0xF00000)?i5&15:-1, (maz&0xF000000)?i6&15:-1, (maz&0xF0000000)?i7&15:-1,
                (maz&0xF00000000)?i8&15:-1, (maz&0xF000000000)?i9&15:-1, (maz&0xF0000000000)?i10&15:-1, (maz&0xF00000000000)?i11&15:-1,
                (maz&0xF000000000000)?i12&15:-1, (maz&0xF0000000000000)?i13&15:-1, (maz&0xF00000000000000)?i14&15:-1, (maz&0xF000000000000000)?i15&15:-1> (a);
            // write mask for elements from b
            const __mmask16 sb = ((mbz&0xF)?1:0) | ((mbz&0xF0)?0x2:0) | ((mbz&0xF00)?0x4:0) | ((mbz&0xF000)?0x8:0) | ((mbz&0xF0000)?0x10:0) | ((mbz&0xF00000)?0x20:0) | ((mbz&0xF000000)?0x40:0) | ((mbz&0xF0000000)?0x80:0)
                | ((mbz&0xF00000000)?0x100:0) | ((mbz&0xF000000000)?0x200:0) | ((mbz&0xF0000000000)?0x400:0) | ((mbz&0xF00000000000)?0x800:0) | ((mbz&0xF000000000000)?0x1000:0) | ((mbz&0xF0000000000000)?0x2000:0) | ((mbz&0xF00000000000000)?0x4000:0) | ((mbz&0xF000000000000000)?0x8000:0);
            // permute index for elements from b
            const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
            // 2. Permute elements from b and combine with elements from a through write mask
            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
        }
        // not same permute pattern in all lanes. use full permute
    }

    // general case: full permute
    const __m512i pmask = constant16i<i0&0x1F, i1&0x1F, i2&0x1F, i3&0x1F, i4&0x1F, i5&0x1F, i6&0x1F, i7&0x1F,
        i8&0x1F, i9&0x1F, i10&0x1F, i11&0x1F, i12&0x1F, i13&0x1F, i14&0x1F, i15&0x1F>();
    if (dozero) {
        return _mm512_maskz_permutex2var_epi32(z, a, pmask, b);
    }
    else {
        return _mm512_permutex2var_epi32(a, pmask, b);
    }
}

template <int i0,  int i1,  int i2,  int i3,  int i4,  int i5,  int i6,  int i7,
          int i8,  int i9,  int i10, int i11, int i12, int i13, int i14, int i15 >
static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui( blend16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (Vec16i(a),Vec16i(b)));
}


/*****************************************************************************
*
*          Vector lookup functions
*
******************************************************************************
*
* These functions use vector elements as indexes into a table.
* The table is given as one or more vectors or as an array.
*
* This can be used for several purposes:
*  - table lookup
*  - permute or blend with variable indexes
*  - blend from more than two sources
*  - gather non-contiguous data
*
* An index out of range may produce any value - the actual value produced is
* implementation dependent and may be different for different instruction
* sets. An index out of range does not produce an error message or exception.
*
* Example:
* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
* Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
* Vec8q c;
* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
*
*****************************************************************************/

static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
    return _mm512_permutexvar_epi32(index, table);
}

template <int n>
static inline Vec16i lookup(Vec16i const & index, void const * table) {
    if (n <= 0) return 0;
    if (n <= 16) {
        Vec16i table1 = Vec16i().load(table);
        return lookup16(index, table1);
    }
    if (n <= 32) {
        Vec16i table1 = Vec16i().load(table);
        Vec16i table2 = Vec16i().load((int8_t*)table + 64);
        return _mm512_permutex2var_epi32(table1, index, table2);
    }
    // n > 32. Limit index
    Vec16ui index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec16ui(index) & (n-1);
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec16ui(index), uint32_t(n-1));
    }
    return _mm512_i32gather_epi32(index1, (const int*)table, 4);
    // return  _mm512_i32gather_epi32(index1, table, _MM_UPCONV_EPI32_NONE, 4, 0);
}
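
// Example (illustrative): table lookup from a 64-entry array.
// static int32_t table[64];                // filled with application data
// Vec16i idx(0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45);
// Vec16i r = lookup<64>(idx, table);       // r[i] = table[idx[i] & 63], because 64 is a power of 2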


static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
    return _mm512_permutexvar_epi64(index, table);
}

template <int n>
static inline Vec8q lookup(Vec8q const & index, void const * table) {
    if (n <= 0) return 0;
    if (n <= 8) {
        Vec8q table1 = Vec8q().load(table);
        return lookup8(index, table1);
    }
    if (n <= 16) {
        Vec8q table1 = Vec8q().load(table);
        Vec8q table2 = Vec8q().load((int8_t*)table + 64);
        return _mm512_permutex2var_epi64(table1, index, table2);
    }
    // n > 16. Limit index
    Vec8uq index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec8uq(index) & (n-1);
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec8uq(index), uint32_t(n-1));
    }
    return _mm512_i64gather_epi64(index1, (const long long*)table, 8);
}


/*****************************************************************************
*
*          Gather functions with fixed indexes
*
*****************************************************************************/
// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16i gather16i(void const * a) {
    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
    // find smallest and biggest index, using only compile-time constant expressions
    const int i01min   = i0  < i1  ? i0  : i1;
    const int i23min   = i2  < i3  ? i2  : i3;
    const int i45min   = i4  < i5  ? i4  : i5;
    const int i67min   = i6  < i7  ? i6  : i7;
    const int i89min   = i8  < i9  ? i8  : i9;
    const int i1011min = i10 < i11 ? i10 : i11;
    const int i1213min = i12 < i13 ? i12 : i13;
    const int i1415min = i14 < i15 ? i14 : i15;
    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
    const int i01max   = i0  > i1  ? i0  : i1;
    const int i23max   = i2  > i3  ? i2  : i3;
    const int i45max   = i4  > i5  ? i4  : i5;
    const int i67max   = i6  > i7  ? i6  : i7;
    const int i89max   = i8  > i9  ? i8  : i9;
    const int i1011max = i10 > i11 ? i10 : i11;
    const int i1213max = i12 > i13 ? i12 : i13;
    const int i1415max = i14 > i15 ? i14 : i15;
    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
    if (imax - imin <= 15) {
        // load one contiguous block and permute
        if (imax > 15) {
            // make sure we don't read past the end of the array
            Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
            return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
        }
        else {
            Vec16i b = Vec16i().load((int32_t const *)a + imin);
            return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
        }
    }
    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)
    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
        // load two contiguous blocks and blend
        Vec16i b = Vec16i().load((int32_t const *)a + imin);
        Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
        return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
    }
    // use gather instruction
    return _mm512_i32gather_epi32(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const int *)a, 4);
}


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q gather8q(void const * a) {
    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative

    const int i01min = i0 < i1 ? i0 : i1;
    const int i23min = i2 < i3 ? i2 : i3;
    const int i45min = i4 < i5 ? i4 : i5;
    const int i67min = i6 < i7 ? i6 : i7;
    const int i0123min = i01min < i23min ? i01min : i23min;
    const int i4567min = i45min < i67min ? i45min : i67min;
    const int imin = i0123min < i4567min ? i0123min : i4567min;
    const int i01max = i0 > i1 ? i0 : i1;
    const int i23max = i2 > i3 ? i2 : i3;
    const int i45max = i4 > i5 ? i4 : i5;
    const int i67max = i6 > i7 ? i6 : i7;
    const int i0123max = i01max > i23max ? i01max : i23max;
    const int i4567max = i45max > i67max ? i45max : i67max;
    const int imax = i0123max > i4567max ? i0123max : i4567max;
    if (imax - imin <= 7) {
        // load one contiguous block and permute
        if (imax > 7) {
            // make sure we don't read past the end of the array
            Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
            return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
        }
        else {
            Vec8q b = Vec8q().load((int64_t const *)a + imin);
            return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
        }
    }
    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
        // load two contiguous blocks and blend
        Vec8q b = Vec8q().load((int64_t const *)a + imin);
        Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
        return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
    }
    // use gather instruction
    return _mm512_i64gather_epi64(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const long long *)a, 8);
}
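
// Example: gather every second element from an array:
// int64_t data[16];                              // must hold at least max index + 1 elements
// Vec8q x = gather8q<0,2,4,6,8,10,12,14>(data);  // x = (data[0], data[2], ..., data[14])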

/*****************************************************************************
*
*          Vector scatter functions
*
******************************************************************************
*
* These functions write the elements of a vector to arbitrary positions in an
* array in memory. Each vector element is written to an array position
* determined by an index. An element is not written if the corresponding
* index is out of range.
* The indexes can be specified as constant template parameters or as an
* integer vector.
*
* The scatter functions are useful if the data are to be distributed sparsely
* over the array. If the array is dense then it is more efficient
* to permute the data into the right positions and then write the whole
* permuted vector into the array.
*
* Example:
* Vec8q a(10,11,12,13,14,15,16,17);
* int64_t b[16] = {0};
* scatter<0,2,14,10,1,-1,5,9>(a,b);
* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
*
*****************************************************************************/

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline void scatter(Vec16i const & data, void * array) {
    __m512i indx = constant16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15>();
    Vec16ib mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0,
        i8>=0, i9>=0, i10>=0, i11>=0, i12>=0, i13>=0, i14>=0, i15>=0);
    _mm512_mask_i32scatter_epi32((int*)array, mask, indx, data, 4);
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline void scatter(Vec8q const & data, void * array) {
    __m256i indx = constant8i<i0,i1,i2,i3,i4,i5,i6,i7>();
    Vec8qb mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0);
    _mm512_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8);
}

static inline void scatter(Vec16i const & index, uint32_t limit, Vec16i const & data, void * array) {
    Vec16ib mask = Vec16ui(index) < limit;
    _mm512_mask_i32scatter_epi32((int*)array, mask, index, data, 4);
}

static inline void scatter(Vec8q const & index, uint32_t limit, Vec8q const & data, void * array) {
    Vec8qb mask = Vec8uq(index) < uint64_t(limit);
    _mm512_mask_i64scatter_epi64((long long *)array, mask, index, data, 8);
}
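
// Example: write only the elements whose index is inside the array:
// Vec8q   idx(0,5,99,3,7,120,1,6);       // some indexes are out of range
// Vec8q   dat(10,11,12,13,14,15,16,17);
// int64_t arr[8] = {0};
// scatter(idx, 8, dat, arr);             // elements with idx[i] >= 8 are not written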

static inline void scatter(Vec8i const & index, uint32_t limit, Vec8q const & data, void * array) {
#if defined (__AVX512VL__)
    __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
#else
    __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
#endif
    _mm512_mask_i32scatter_epi64((long long *)array, mask, index, data, 8);
}

/*****************************************************************************
*
*          Functions for conversion between integer sizes
*
*****************************************************************************/

// Extend 16-bit integers to 32-bit integers, signed and unsigned

// Function extend_to_int : extends Vec16s to Vec16i with sign extension
static inline Vec16i extend_to_int (Vec16s const & a) {
    return _mm512_cvtepi16_epi32(a);
}

// Function extend_to_int : extends Vec16us to Vec16ui with zero extension
static inline Vec16ui extend_to_int (Vec16us const & a) {
    return _mm512_cvtepu16_epi32(a);
}

// Function extend_to_int : extends Vec16c to Vec16i with sign extension
static inline Vec16i extend_to_int (Vec16c const & a) {
    return _mm512_cvtepi8_epi32(a);
}

// Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
static inline Vec16ui extend_to_int (Vec16uc const & a) {
    return _mm512_cvtepu8_epi32(a);
}


// Extend 32-bit integers to 64-bit integers, signed and unsigned

// Function extend_low : extends the low 8 elements to 64 bits with sign extension
static inline Vec8q extend_low (Vec16i const & a) {
    return _mm512_cvtepi32_epi64(a.get_low());
}

// Function extend_high : extends the high 8 elements to 64 bits with sign extension
static inline Vec8q extend_high (Vec16i const & a) {
    return _mm512_cvtepi32_epi64(a.get_high());
}

// Function extend_low : extends the low 8 elements to 64 bits with zero extension
static inline Vec8uq extend_low (Vec16ui const & a) {
    return _mm512_cvtepu32_epi64(a.get_low());
}

// Function extend_high : extends the high 8 elements to 64 bits with zero extension
static inline Vec8uq extend_high (Vec16ui const & a) {
    return _mm512_cvtepu32_epi64(a.get_high());
}

// Compress 32-bit integers to 8-bit or 16-bit integers, signed and unsigned, with and without saturation

// Functions compress_to_int8 / compress_to_int16 : truncate each 32-bit element to 8 or 16 bits
// Overflow wraps around
static inline Vec16c compress_to_int8 (Vec16i const & a) {
    return _mm512_cvtepi32_epi8(a);
}

static inline Vec16s compress_to_int16 (Vec16i const & a) {
    return _mm512_cvtepi32_epi16(a);
}

// with signed saturation
static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
    return _mm512_cvtsepi32_epi8(a);
}

static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
    return _mm512_cvtsepi32_epi16(a);
}

// with unsigned saturation
static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
    return _mm512_cvtusepi32_epi8(a);
}

static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
    return _mm512_cvtusepi32_epi16(a);
}

// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation

// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Overflow wraps around
static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
    Vec8i low2   = _mm512_cvtepi64_epi32(low);
    Vec8i high2  = _mm512_cvtepi64_epi32(high);
    return Vec16i(low2, high2);
}

// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Signed, with saturation
static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
    Vec8i low2   = _mm512_cvtsepi64_epi32(low);
    Vec8i high2  = _mm512_cvtsepi64_epi32(high);
    return Vec16i(low2, high2);
}

// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Unsigned, with saturation
static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
    Vec8ui low2   = _mm512_cvtusepi64_epi32(low);
    Vec8ui high2  = _mm512_cvtusepi64_epi32(high);
    return Vec16ui(low2, high2);
}
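
// Example: wrapping versus saturating compression of the same value:
// Vec8q  big(0x123456789);                   // does not fit in 32 bits
// Vec16i w = compress(big, big);             // wraps:     every element is 0x23456789
// Vec16i s = compress_saturated(big, big);   // saturates: every element is 0x7FFFFFFF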


/*****************************************************************************
*
*          Integer division operators
*
*          Please see the file vectori128.h for explanation.
*
*****************************************************************************/

// vector operator / : divide each element by divisor

// vector of 16 32-bit signed integers
static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
    __m512i m   = _mm512_broadcast_i32x4(d.getm());        // broadcast multiplier
    __m512i sgn = _mm512_broadcast_i32x4(d.getsign());     // broadcast sign of d
    __m512i t1  = _mm512_mul_epi32(a,m);                   // 32x32->64 bit signed multiplication of even elements of a
    __m512i t3  = _mm512_srli_epi64(a,32);                 // get odd elements of a into position for multiplication
    __m512i t4  = _mm512_mul_epi32(t3,m);                  // 32x32->64 bit signed multiplication of odd elements
    __m512i t2  = _mm512_srli_epi64(t1,32);                // high dword of even index results
    __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
    __m512i t8  = _mm512_add_epi32(t7,a);                  // add
    __m512i t9  = _mm512_sra_epi32(t8,d.gets1());          // shift right arithmetic
    __m512i t10 = _mm512_srai_epi32(a,31);                 // sign of a
    __m512i t11 = _mm512_sub_epi32(t10,sgn);               // sign of a - sign of d
    __m512i t12 = _mm512_sub_epi32(t9,t11);                // + 1 if a < 0, -1 if d < 0
    return        _mm512_xor_si512(t12,sgn);               // change sign if divisor negative
}
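
// Scalar sketch of the signed case above (illustrative only; m, s1 and sign_d
// stand for the values stored in Divisor_i, names chosen here for clarity):
// int32_t div(int32_t x, int32_t m, int s1, int32_t sign_d) {   // sign_d is 0 or -1
//     int32_t t = int32_t(((int64_t)x * m) >> 32) + x;          // high half of x*m, plus x
//     t >>= s1;                                                 // arithmetic shift right
//     t -= (x >> 31) - sign_d;                                  // round toward zero: +1 if x < 0, -1 if d < 0
//     return t ^ sign_d;                                        // together with the -1 above, negates if d < 0
// }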

// vector of 16 32-bit unsigned integers
static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
    __m512i m   = _mm512_broadcast_i32x4(d.getm());        // broadcast multiplier
    __m512i t1  = _mm512_mul_epu32(a,m);                   // 32x32->64 bit unsigned multiplication of even elements of a
    __m512i t3  = _mm512_srli_epi64(a,32);                 // get odd elements of a into position for multiplication
    __m512i t4  = _mm512_mul_epu32(t3,m);                  // 32x32->64 bit unsigned multiplication of odd elements
    __m512i t2  = _mm512_srli_epi64(t1,32);                // high dword of even index results
    __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
    __m512i t8  = _mm512_sub_epi32(a,t7);                  // subtract
    __m512i t9  = _mm512_srl_epi32(t8,d.gets1());          // shift right logical
    __m512i t10 = _mm512_add_epi32(t7,t9);                 // add
    return        _mm512_srl_epi32(t10,d.gets2());         // shift right logical
}
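
// Illustrative scalar model (not from the library) of the computation above,
// following Granlund & Montgomery, "Division by Invariant Integers Using
// Multiplication". Per element, with m, s1, s2 taken from Divisor_ui:
//
//     uint32_t hi = uint32_t((uint64_t(a) * m) >> 32);    // high half of unsigned product
//     uint32_t q  = (hi + ((a - hi) >> s1)) >> s2;        // == a / d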

// vector operator /= : divide
static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
    a = a / d;
    return a;
}


/*****************************************************************************
*
*          Integer division 2: divisor is a compile-time constant
*
*****************************************************************************/

// Divide Vec16i by compile-time constant
template <int32_t d>
static inline Vec16i divide_by_i(Vec16i const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
    if (d ==  1) return  x;
    if (d == -1) return -x;
    if (uint32_t(d) == 0x80000000u) {
        return _mm512_maskz_set1_epi32(x == Vec16i(0x80000000), 1);  // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0;
    }
    const uint32_t d1 = d > 0 ? uint32_t(d) : -uint32_t(d);          // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
    if ((d1 & (d1-1)) == 0) {
        // d1 is a power of 2. use shift
        const int k = bit_scan_reverse_const(d1);
        __m512i sign;
        if (k > 1) sign = _mm512_srai_epi32(x, k-1); else sign = x;  // k copies of sign bit
        __m512i bias    = _mm512_srli_epi32(sign, 32-k);             // bias = x >= 0 ? 0 : 2^k - 1
        __m512i xpbias  = _mm512_add_epi32 (x, bias);                // x + bias
        __m512i q       = _mm512_srai_epi32(xpbias, k);              // (x + bias) >> k
        if (d > 0)      return q;                                    // d > 0: return  q
        return _mm512_sub_epi32(_mm512_setzero_epi32(), q);          // d < 0: return -q
    }
    // general case
    const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);       // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
    const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32));   // multiplier
    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
    return x / div;
}

// define Vec16i a / const_int(d)
template <int32_t d>
static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
    return divide_by_i<d>(a);
}

// define Vec16i a / const_uint(d)
template <uint32_t d>
static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
    return divide_by_i<int32_t(d)>(a);                               // signed divide
}

// vector operator /= : divide
template <int32_t d>
static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}
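
// Illustrative usage (not part of the library): dividing by a constant known
// at compile time lets the templates above select the fastest method for that
// particular divisor:
//
//     Vec16i x = ...;                 // any Vec16i
//     Vec16i q = x / const_int(10);   // multiply and shift, no division instruction
//     x /= const_int(4);              // power of 2: arithmetic shift with bias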


// Divide Vec16ui by compile-time constant
template <uint32_t d>
static inline Vec16ui divide_by_ui(Vec16ui const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
    if (d == 1) return x;                                            // divide by 1
    const int b = bit_scan_reverse_const(d);                         // floor(log2(d))
    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
        // d is a power of 2. use shift
        return  _mm512_srli_epi32(x, b);                             // x >> b
    }
    // general case (d > 2)
    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d);           // multiplier = 2^(32+b) / d
    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
    const bool round_down = (2*rem < d);                             // check if fraction is less than 0.5
    if (!round_down) {
        mult = mult + 1;                                             // round up mult
    }
    // do 32*32->64 bit unsigned multiplication and get high part of result
    const __m512i multv = Vec8uq(uint64_t(mult));                    // zero-extend mult and broadcast
    __m512i t1 = _mm512_mul_epu32(x,multv);                          // 32x32->64 bit unsigned multiplication of even elements
    if (round_down) {
        t1      = _mm512_add_epi64(t1,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m512i t2 = _mm512_srli_epi64(t1,32);                           // high dword of even index results
    __m512i t3 = _mm512_srli_epi64(x,32);                            // get odd elements into position for multiplication
    __m512i t4 = _mm512_mul_epu32(t3,multv);                         // 32x32->64 bit unsigned multiplication of odd elements
    if (round_down) {
        t4      = _mm512_add_epi64(t4,multv);                        // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m512i t7 = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);   // blend two results
    Vec16ui q  = _mm512_srli_epi32(t7, b);                           // shift right by b
    return q;                                                        // no overflow possible
}
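
// Illustrative worked instance (not from the library) of the multiplier
// computation above, for d = 10:
//
//     b    = bit_scan_reverse_const(10) = 3
//     mult = 2^35 / 10 = 3435973836, remainder 8;  2*8 >= 10, so round up:
//     mult = 3435973837 = 0xCCCCCCCD
//     q    = uint32_t((uint64_t(x) * 0xCCCCCCCD) >> 35)    // == x / 10 for any uint32_t x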

// define Vec16ui a / const_uint(d)
template <uint32_t d>
static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
    return divide_by_ui<d>(a);
}

// define Vec16ui a / const_int(d)
template <int32_t d>
static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative; // Error: dividing unsigned by negative is ambiguous
    return divide_by_ui<d>(a);                                       // unsigned divide
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <int32_t d>
static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}
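
// Illustrative usage (not part of the library):
//
//     Vec16ui x = ...;                  // any Vec16ui
//     Vec16ui q = x / const_uint(10);   // compile-time constant divisor, uses divide_by_ui<10>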

/*****************************************************************************
*
*          Horizontal scan functions
*
*****************************************************************************/

// Get index to the first element that is true. Return -1 if all are false

static inline int horizontal_find_first(Vec16ib const & x) {
    uint32_t b = uint16_t(__mmask16(x));
    if (b) {
        return bit_scan_forward(b);
    }
    else {
        return -1;
    }
}

static inline int horizontal_find_first(Vec8qb const & x) {
    uint32_t b = uint8_t(__mmask16(x));
    if (b) {
        return bit_scan_forward(b);
    }
    else {
        return -1;
    }
}

static inline uint32_t horizontal_count(Vec16ib const & x) {
    return vml_popcnt(uint32_t(uint16_t(__mmask16(x))));
}

static inline uint32_t horizontal_count(Vec8qb const & x) {
    return vml_popcnt(uint32_t(uint8_t(__mmask16(x))));  // only the low 8 mask bits are significant for Vec8qb
}

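// Illustrative usage (not part of the library): the scan functions operate on
// the boolean result of a vector comparison:
//
//     Vec16i  a = ...;                          // any Vec16i
//     Vec16ib m = a > 100;
//     int      i = horizontal_find_first(m);    // index of first element > 100, or -1
//     uint32_t n = horizontal_count(m);         // number of elements > 100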

/*****************************************************************************
*
*          Boolean <-> bitfield conversion functions
*
*****************************************************************************/
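
// Note: the 128-bit and 256-bit boolean vectors defined in vectori128.h and
// vectori256.h are broad (full-width) vectors, not compact bit masks. The
// to_bits functions below therefore widen the vector to 512 bits and use
// _mm512_mask_testn_epi32_mask / _mm512_mask_testn_epi64_mask, which set a
// mask bit for each element that is zero (false); the result is then inverted
// within the active lanes to get 1-bits for true elements.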

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4ib x) {
    __m512i a = _mm512_castsi128_si512(x);
    __mmask16 b = _mm512_mask_testn_epi32_mask(0xF, a, a);
    return uint8_t(b) ^ 0xF;
}

// to_Vec4ib: convert integer bitfield to boolean vector
static inline Vec4ib to_Vec4ib(uint8_t x) {
    return _mm512_castsi512_si128(_mm512_maskz_set1_epi32(__mmask16(x), -1));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec2qb x) {
    __m512i a = _mm512_castsi128_si512(x);
    __mmask16 b = _mm512_mask_testn_epi64_mask(0x3, a, a);
    return uint8_t(b) ^ 0x3;
}

// to_Vec2qb: convert integer bitfield to boolean vector
static inline Vec2qb to_Vec2qb(uint8_t x) {
    return _mm512_castsi512_si128(_mm512_maskz_set1_epi64(__mmask16(x), -1LL));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec8ib x) {
    __m512i a = _mm512_castsi256_si512(x);
    __mmask16 b = _mm512_mask_testn_epi32_mask(0xFF, a, a);
    return ~ uint8_t(b);
}

// to_Vec8ib: convert integer bitfield to boolean vector
static inline Vec8ib to_Vec8ib(uint8_t x) {
    return _mm512_castsi512_si256(_mm512_maskz_set1_epi32(__mmask16(x), -1));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4qb x) {
    __m512i a = _mm512_castsi256_si512(x);
    __mmask16 b = _mm512_mask_testn_epi64_mask(0xF, a, a);
    return uint8_t(b) ^ 0xF;
}

// to_Vec4qb: convert integer bitfield to boolean vector
static inline Vec4qb to_Vec4qb(uint8_t x) {
    return _mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(x), -1LL));
}


// to_bits: convert boolean vector to integer bitfield
static inline uint16_t to_bits(Vec16b a) {
    return (uint16_t)(__mmask16)a;
}

// to_Vec16b: convert integer bitfield to boolean vector
static inline Vec16b to_Vec16b(uint16_t x) {
    return (__mmask16)x;
}

// to_Vec16ib: convert integer bitfield to boolean vector
static inline Vec16ib to_Vec16ib(uint16_t x) {
    return to_Vec16b(x);
}

// to_Vec8qb: convert integer bitfield to boolean vector
static inline Vec8qb to_Vec8qb(uint8_t x) {
    return (__mmask16)x;
}
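
// Illustrative usage (not part of the library): round trip between a boolean
// vector and its bitfield representation:
//
//     Vec16ib  b = to_Vec16ib(0x00FF);   // elements 0-7 true, elements 8-15 false
//     uint16_t m = to_bits(b);           // m == 0x00FF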

#ifdef VCL_NAMESPACE
}
#endif

#endif // VECTORI512_H