/****************************  vectori512.h   *******************************
* Author:        Agner Fog
* Date created:  2014-07-23
* Last modified: 2017-02-19
* Version:       1.27
* Project:       vector classes
* Description:
* Header file defining integer vector classes as interface to intrinsic
* functions in x86 microprocessors with AVX512 and later instruction sets.
*
* Instructions:
* Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
* instruction set, which must be at least AVX512.
*
* The following vector classes are defined here:
* Vec16i    Vector of 16 32-bit signed integers
* Vec16ui   Vector of 16 32-bit unsigned integers
* Vec16ib   Vector of 16 Booleans for use with Vec16i and Vec16ui
* Vec8q     Vector of 8 64-bit signed integers
* Vec8uq    Vector of 8 64-bit unsigned integers
* Vec8qb    Vector of 8 Booleans for use with Vec8q and Vec8uq
*
* Each vector object is represented internally in the CPU as a 512-bit register.
* This header file defines operators and functions for these vectors.
*
* For detailed instructions, see VectorClass.pdf
*
* (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
*****************************************************************************/

// check combination of header files
#if defined (VECTORI512_H)
#if VECTORI512_H != 2
#error Two different versions of vectori512.h included
#endif
#else
#define VECTORI512_H 2

#ifdef VECTORF512_H
#error Please put header file vectori512.h before vectorf512.h
#endif


#if INSTRSET < 9   // AVX512 required
#error Wrong instruction set for vectori512.h, AVX512 required or use vectori512e.h
#endif

#include "vectori256.h"

#ifdef VCL_NAMESPACE
namespace VCL_NAMESPACE {
#endif

// Bug fix for missing intrinsics:
// _mm512_cmpgt_epu32_mask, _mm512_cmpgt_epu64_mask
// all typecast intrinsics
// Fix expected in GCC version 4.9.3 or 5.0. https://gcc.gnu.org/bugzilla/show_bug.cgi?id=61878

// questionable
// _mm512_mask_mov_epi32 check select(). Doc at https://software.intel.com/en-us/node/513888 is wrong. Bug report filed


#if defined (GCC_VERSION) && GCC_VERSION < 50000 && !defined(__INTEL_COMPILER) && !defined(__clang__)

static inline __m512i _mm512_castsi256_si512(__m256i x) {
    union {
        __m512i a;
        __m256i b;
    } u;
    u.b = x;
    return u.a;
}

static inline __m256i _mm512_castsi512_si256(__m512i x) {
    union {
        __m512i a;
        __m256i b;
    } u;
    u.a = x;
    return u.b;
}

static inline __m512i _mm512_castsi128_si512(__m128i x) {
    union {
        __m128i a;
        __m512i b;
    } u;
    u.a = x;
    return u.b;
}

static inline __m128i _mm512_castsi512_si128(__m512i x) {
    union {
        __m512i a;
        __m128i b;
    } u;
    u.a = x;
    return u.b;
}

#endif


/*****************************************************************************
*
*          Generate compile-time constant vector
*
*****************************************************************************/
// Generate a constant vector of 16 integers stored in memory.
// Can be converted to any integer vector type
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
    int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline __m512i constant16i() {
    static const union {
        int32_t i[16];
        __m512i zmm;
    } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};
    return u.zmm;
}
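
// Example (illustrative only, not part of the library interface): constant16i
// builds the vector once in static storage, so it is cheap to call repeatedly.
// The function name below is hypothetical.
static inline __m512i example_ascending_indexes() {
    // returns {0,1,2,...,15}, e.g. for use as a permutation index vector
    return constant16i<0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15>();
}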


/*****************************************************************************
*
*          Boolean vector base classes for AVX512
*
*****************************************************************************/

class Vec16b {
protected:
    __mmask16 m16; // Boolean vector
public:
    // Default constructor:
    Vec16b () {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec16b (__mmask16 x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7,
        bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7 |
            b8<<8 | b9<<9 | b10<<10 | b11<<11 | b12<<12 | b13<<13 | b14<<14 | b15<<15);
    }
    // Constructor to broadcast single value:
    Vec16b(bool b) {
        m16 = __mmask16(-int16_t(b));
    }
private: // Prevent constructing from int, etc.
    Vec16b(int b);
public:
    // Constructor to make from two halves
    Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
        // = Vec16i(x0,x1) != 0;  (not defined yet)
        __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
        m16 = _mm512_cmpneq_epi32_mask(z, _mm512_setzero_epi32());
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec16b & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Assignment operator to broadcast scalar value:
    Vec16b & operator = (bool b) {
        m16 = Vec16b(b);
        return *this;
    }
private: // Prevent assigning int because of ambiguity
    Vec16b & operator = (int x);
public:
    // Type cast operator to convert to __mmask16 used in intrinsics
    operator __mmask16() const {
        return m16;
    }
    // split into two halves
    Vec8ib get_low() const {
        return to_Vec8ib((uint8_t)m16);
    }
    Vec8ib get_high() const {
        return to_Vec8ib((uint16_t)m16 >> 8);
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec16b const & insert(uint32_t index, bool value) {
        m16 = __mmask16(((uint16_t)m16 & ~(1 << index)) | (int)value << index);
        return *this;
    }
    // Member function extract a single element from vector
    bool extract(uint32_t index) const {
        return ((uint32_t)m16 >> index) & 1;
    }
    // Extract a single element. Operator [] can only read an element, not write.
    bool operator [] (uint32_t index) const {
        return extract(index);
    }
    static int size () {
        return 16;
    }
};

// Define operators for this class

// vector operator & : bitwise and
static inline Vec16b operator & (Vec16b a, Vec16b b) {
    return _mm512_kand(a, b);
}
static inline Vec16b operator && (Vec16b a, Vec16b b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec16b operator | (Vec16b a, Vec16b b) {
    return _mm512_kor(a, b);
}
static inline Vec16b operator || (Vec16b a, Vec16b b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec16b operator ^ (Vec16b a, Vec16b b) {
    return _mm512_kxor(a, b);
}

// vector operator ~ : bitwise not
static inline Vec16b operator ~ (Vec16b a) {
    return _mm512_knot(a);
}

// vector operator ! : element not
static inline Vec16b operator ! (Vec16b a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec16b & operator &= (Vec16b & a, Vec16b b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec16b & operator |= (Vec16b & a, Vec16b b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec16b & operator ^= (Vec16b & a, Vec16b b) {
    a = a ^ b;
    return a;
}


/*****************************************************************************
*
*          Functions for boolean vectors
*
*****************************************************************************/

// function andnot: a & ~ b
static inline Vec16b andnot (Vec16b a, Vec16b b) {
    return _mm512_kandn(b, a);
}

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec16b const & a) {
    return (uint16_t)(__mmask16)a == 0xFFFF;
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec16b const & a) {
    return (uint16_t)(__mmask16)a != 0;
}
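
// Example (illustrative only, not part of the library interface): mask vectors
// combine with the operators above and reduce to a scalar bool with
// horizontal_and / horizontal_or. The function name below is hypothetical.
static inline bool example_any_in_both(Vec16b a, Vec16b b) {
    return horizontal_or(a & b);   // true if any element is true in both masks
}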


/*****************************************************************************
*
*          Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
*
*****************************************************************************/

class Vec16ib : public Vec16b {
public:
    // Default constructor:
    Vec16ib () {
    }
    Vec16ib (Vec16b x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
        bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) :
        Vec16b(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15) {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec16ib (__mmask16 x) {
        m16 = x;
    }
    // Constructor to broadcast single value:
    Vec16ib(bool b) : Vec16b(b) {}
private: // Prevent constructing from int, etc.
    Vec16ib(int b);
public:
    // Constructor to make from two halves
    Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
        m16 = Vec16b(x0, x1);
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec16ib & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Assignment operator to broadcast scalar value:
    Vec16ib & operator = (bool b) {
        m16 = Vec16b(b);
        return *this;
    }
private: // Prevent assigning int because of ambiguity
    Vec16ib & operator = (int x);
public:
};

// Define operators for Vec16ib

// vector operator & : bitwise and
static inline Vec16ib operator & (Vec16ib a, Vec16ib b) {
    return Vec16b(a) & Vec16b(b);
}
static inline Vec16ib operator && (Vec16ib a, Vec16ib b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec16ib operator | (Vec16ib a, Vec16ib b) {
    return Vec16b(a) | Vec16b(b);
}
static inline Vec16ib operator || (Vec16ib a, Vec16ib b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec16ib operator ^ (Vec16ib a, Vec16ib b) {
    return Vec16b(a) ^ Vec16b(b);
}

// vector operator ~ : bitwise not
static inline Vec16ib operator ~ (Vec16ib a) {
    return ~Vec16b(a);
}

// vector operator ! : element not
static inline Vec16ib operator ! (Vec16ib a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib b) {
    a = a ^ b;
    return a;
}

// vector function andnot
static inline Vec16ib andnot (Vec16ib a, Vec16ib b) {
    return Vec16ib(andnot(Vec16b(a), Vec16b(b)));
}


/*****************************************************************************
*
*          Vec8b: Base class vector of 8 Booleans
*
*****************************************************************************/

class Vec8b : public Vec16b {
public:
    // Default constructor:
    Vec8b () {
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec8b (__mmask16 x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec8b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7) {
        m16 = uint16_t(b0 | b1<<1 | b2<<2 | b3<<3 | b4<<4 | b5<<5 | b6<<6 | b7<<7);
    }
    Vec8b (Vec16b const & x) {
        m16 = __mmask16(x);
    }
    // Constructor to broadcast single value:
    Vec8b(bool b) {
        m16 = __mmask16(-int8_t(b));
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec8b & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
private: // Prevent constructing from int etc. because of ambiguity
    Vec8b(int b);
    Vec8b & operator = (int x);
public:
    // split into two halves
    Vec4qb get_low() const {
        return Vec4qb(Vec4q(_mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(m16), -1LL))));
    }
    Vec4qb get_high() const {
        return Vec8b(__mmask16(m16 >> 4)).get_low();
    }
    static int size () {
        return 8;
    }
};

/*****************************************************************************
*
*          Functions for boolean vectors
*
*****************************************************************************/

// function andnot: a & ~ b
static inline Vec8b andnot (Vec8b a, Vec8b b) {
    return _mm512_kandn(b, a);
}

// horizontal_and. Returns true if all bits are 1
static inline bool horizontal_and (Vec8b const & a) {
    return (uint8_t)(__mmask16)a == 0xFF;
}

// horizontal_or. Returns true if at least one bit is 1
static inline bool horizontal_or (Vec8b const & a) {
    return (uint8_t)(__mmask16)a != 0;
}


/*****************************************************************************
*
*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
*
*****************************************************************************/

class Vec8qb : public Vec8b {
public:
    // Default constructor:
    Vec8qb () {
    }
    Vec8qb (Vec16b x) {
        m16 = x;
    }
    // Constructor to build from all elements:
    Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) :
        Vec8b(x0, x1, x2, x3, x4, x5, x6, x7) {
    }
    // Constructor to convert from type __mmask8 used in intrinsics:
    Vec8qb (__mmask8 x) {
        m16 = (__mmask16)x;
    }
    // Constructor to convert from type __mmask16 used in intrinsics:
    Vec8qb (__mmask16 x) {
        m16 = x;
    }
    // Assignment operator to convert from type __mmask16 used in intrinsics:
    Vec8qb & operator = (__mmask16 x) {
        m16 = x;
        return *this;
    }
    // Constructor to broadcast single value:
    Vec8qb(bool b) : Vec8b(b) {}
    // Assignment operator to broadcast scalar:
    Vec8qb & operator = (bool b) {
        m16 = Vec8b(b);
        return *this;
    }
private: // Prevent constructing from int, etc.
    Vec8qb(int b);
    Vec8qb & operator = (int x);
public:
    // Constructor to make from two halves
    Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
        // = Vec8q(x0,x1) != 0;  (not defined yet)
        __m512i z = _mm512_inserti64x4(_mm512_castsi256_si512(x0), x1, 1);
        m16 = _mm512_cmpneq_epi64_mask(z, _mm512_setzero_si512());
    }
};

// Define operators for Vec8qb

// vector operator & : bitwise and
static inline Vec8qb operator & (Vec8qb a, Vec8qb b) {
    return Vec16b(a) & Vec16b(b);
}
static inline Vec8qb operator && (Vec8qb a, Vec8qb b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec8qb operator | (Vec8qb a, Vec8qb b) {
    return Vec16b(a) | Vec16b(b);
}
static inline Vec8qb operator || (Vec8qb a, Vec8qb b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec8qb operator ^ (Vec8qb a, Vec8qb b) {
    return Vec16b(a) ^ Vec16b(b);
}

// vector operator ~ : bitwise not
static inline Vec8qb operator ~ (Vec8qb a) {
    return ~Vec16b(a);
}

// vector operator ! : element not
static inline Vec8qb operator ! (Vec8qb a) {
    return ~a;
}

// vector operator &= : bitwise and
static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb b) {
    a = a ^ b;
    return a;
}

// to_bits: convert to integer bitfield
static inline uint32_t to_bits(Vec8qb a) {
    return (uint8_t)(__mmask16)a;
}

// vector function andnot
static inline Vec8qb andnot (Vec8qb a, Vec8qb b) {
    return Vec8qb(andnot(Vec16b(a), Vec16b(b)));
}


/*****************************************************************************
*
*          Vector of 512 1-bit unsigned integers (base class for Vec16i)
*
*****************************************************************************/
class Vec512b {
protected:
    __m512i zmm; // Integer vector
public:
    // Default constructor:
    Vec512b() {
    }
    // Constructor to build from two Vec256b:
    Vec512b(Vec256b const & a0, Vec256b const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec512b(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec512b & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    }
    // Member function to load from array (unaligned)
    Vec512b & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    // You may use load_a instead of load if you are certain that p points to an address
    // divisible by 64, but there is hardly any speed advantage of load_a on modern processors
    Vec512b & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Member function to store into array (unaligned)
    void store(void * p) const {
        _mm512_storeu_si512(p, zmm);
    }
    // Member function to store into array, aligned by 64
    // You may use store_a instead of store if you are certain that p points to an address
    // divisible by 64, but there is hardly any speed advantage of store_a on modern processors
    void store_a(void * p) const {
        _mm512_store_si512(p, zmm);
    }
    // Member function to change a single bit, mainly for test purposes
    // Note: This function is inefficient. Use load function if changing more than one bit
    Vec512b const & set_bit(uint32_t index, int value) {
        static uint64_t m[16] = {0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0};
        int wi = (index >> 6) & 7;               // qword index
        int bi = index & 0x3F;                   // bit index within qword w

        __m512i mask = Vec512b().load(m+8-wi);   // 1 in qword number wi
        mask = _mm512_sll_epi64(mask,_mm_cvtsi32_si128(bi)); // mask with bit number b set
        if (value & 1) {
            zmm = _mm512_or_si512(mask,zmm);
        }
        else {
            zmm = _mm512_andnot_si512(mask,zmm);
        }
        return *this;
    }
    // Member function to get a single bit, mainly for test purposes
    // Note: This function is inefficient. Use store function if reading more than one bit
    int get_bit(uint32_t index) const {
        union {
            __m512i z;
            uint8_t i[64];
        } u;
        u.z = zmm;
        int wi = (index >> 3) & 0x3F;            // byte index
        int bi = index & 7;                      // bit index within byte w
        return (u.i[wi] >> bi) & 1;
    }
    // Member functions to split into two Vec256b:
    Vec256b get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec256b get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 512;
    }
};
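
// Example (illustrative only, not part of the library interface): copy one
// 64-byte block through a register. load/store tolerate unaligned pointers;
// use load_a/store_a only for 64-byte aligned addresses. Hypothetical name.
static inline void example_copy_64_bytes(void const * src, void * dst) {
    Vec512b v;
    v.load(src);    // unaligned 512-bit load
    v.store(dst);   // unaligned 512-bit store
}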


// Define operators for this class

// vector operator & : bitwise and
static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
    return _mm512_and_epi32(a, b);
}
static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
    return a & b;
}

// vector operator | : bitwise or
static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
    return _mm512_or_epi32(a, b);
}
static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
    return a | b;
}

// vector operator ^ : bitwise xor
static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
    return _mm512_xor_epi32(a, b);
}

// vector operator ~ : bitwise not
static inline Vec512b operator ~ (Vec512b const & a) {
    return _mm512_xor_epi32(a, _mm512_set1_epi32(-1));
}

// vector operator &= : bitwise and
static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
    a = a & b;
    return a;
}

// vector operator |= : bitwise or
static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
    a = a | b;
    return a;
}

// vector operator ^= : bitwise xor
static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
    a = a ^ b;
    return a;
}

// Define functions for this class

// function andnot: a & ~ b
static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
    return _mm512_andnot_epi32(b, a);
}


/*****************************************************************************
*
*          Vector of 16 32-bit signed integers
*
*****************************************************************************/

class Vec16i: public Vec512b {
public:
    // Default constructor:
    Vec16i() {
    };
    // Constructor to broadcast the same value into all elements:
    Vec16i(int i) {
        zmm = _mm512_set1_epi32(i);
    };
    // Constructor to build from all elements:
    Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
        int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
    };
    // Constructor to build from two Vec8i:
    Vec16i(Vec8i const & a0, Vec8i const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec16i(__m512i const & x) {
        zmm = x;
    };
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec16i & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    };
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    };
    // Member function to load from array (unaligned)
    Vec16i & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec16i & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Partial load. Load n elements and set the rest to 0
    Vec16i & load_partial(int n, void const * p) {
        zmm = _mm512_maskz_loadu_epi32(__mmask16((1 << n) - 1), p);
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, void * p) const {
        _mm512_mask_storeu_epi32(p, __mmask16((1 << n) - 1), zmm);
    }
    // cut off vector to n elements. The last 16-n elements are set to zero
    Vec16i & cutoff(int n) {
        zmm = _mm512_maskz_mov_epi32(__mmask16((1 << n) - 1), zmm);
        return *this;
    }
    // Member function to change a single element in vector
    Vec16i const & insert(uint32_t index, int32_t value) {
        zmm = _mm512_mask_set1_epi32(zmm, __mmask16(1 << index), value);
        return *this;
    };
    // Member function extract a single element from vector
    int32_t extract(uint32_t index) const {
        int32_t a[16];
        store(a);
        return a[index & 15];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    int32_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec8i:
    Vec8i get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec8i get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 16;
    }
};
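
// Example (illustrative only, not part of the library interface): handle an
// array tail of n < 16 elements with load_partial; the unloaded elements are
// zero, so they do not disturb a later sum. Hypothetical name.
static inline Vec16i example_load_tail(int n, int32_t const * p) {
    Vec16i v;
    v.load_partial(n, p);   // elements [0,n) from p, remaining elements = 0
    return v;
}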


// Define operators for Vec16i

// vector operator + : add element by element
static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
    return _mm512_add_epi32(a, b);
}

// vector operator += : add
static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec16i operator ++ (Vec16i & a, int) {
    Vec16i a0 = a;
    a = a + 1;
    return a0;
}

// prefix operator ++
static inline Vec16i & operator ++ (Vec16i & a) {
    a = a + 1;
    return a;
}

// vector operator - : subtract element by element
static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
    return _mm512_sub_epi32(a, b);
}

// vector operator - : unary minus
static inline Vec16i operator - (Vec16i const & a) {
    return _mm512_sub_epi32(_mm512_setzero_epi32(), a);
}

// vector operator -= : subtract
static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec16i operator -- (Vec16i & a, int) {
    Vec16i a0 = a;
    a = a - 1;
    return a0;
}

// prefix operator --
static inline Vec16i & operator -- (Vec16i & a) {
    a = a - 1;
    return a;
}

// vector operator * : multiply element by element
static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
    return _mm512_mullo_epi32(a, b);
}

// vector operator *= : multiply
static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
    a = a * b;
    return a;
}

// vector operator / : divide all elements by same integer
// See bottom of file

// vector operator << : shift left
static inline Vec16i operator << (Vec16i const & a, int32_t b) {
    return _mm512_sll_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator <<= : shift left
static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
    a = a << b;
    return a;
}

// vector operator >> : shift right arithmetic
static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
    return _mm512_sra_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator >>= : shift right arithmetic
static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
    a = a >> b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpeq_epi32_mask(a, b);
}

// vector operator != : returns true for elements for which a != b
static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpneq_epi32_mask(a, b);
}

// vector operator > : returns true for elements for which a > b
static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpgt_epi32_mask(a, b);
}

// vector operator < : returns true for elements for which a < b
static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
    return b > a;
}

// vector operator >= : returns true for elements for which a >= b (signed)
static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
    return _mm512_cmpge_epi32_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (signed)
static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
    return _mm512_and_epi32(a, b);
}

// vector operator &= : bitwise and
static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
    return _mm512_or_epi32(a, b);
}

// vector operator |= : bitwise or
static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
    return _mm512_xor_epi32(a, b);
}

// vector operator ^= : bitwise xor
static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec16i operator ~ (Vec16i const & a) {
    return a ^ Vec16i(-1);
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
    return _mm512_mask_mov_epi32(b, s, a);  // conditional move may be optimized better by the compiler than blend
    // return _mm512_mask_blend_epi32(s, b, a);
}
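
// Example (illustrative only, not part of the library interface): a mask from
// a comparison feeds directly into select, here clamping negative elements to
// zero. Hypothetical name.
static inline Vec16i example_clamp_negative(Vec16i const & x) {
    return select(x > Vec16i(0), x, Vec16i(0));   // x[i] > 0 ? x[i] : 0
}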

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
    return _mm512_mask_add_epi32(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline int32_t horizontal_add (Vec16i const & a) {
#if defined(__INTEL_COMPILER)
    return _mm512_reduce_add_epi32(a);
#else
    return horizontal_add(a.get_low() + a.get_high());
#endif
}
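
// Example (illustrative only, not part of the library interface): a dot
// product of two 16-element blocks, wrapping on overflow exactly as the
// scalar sum of int32_t products would. Hypothetical name.
static inline int32_t example_dot16(Vec16i const & a, Vec16i const & b) {
    return horizontal_add(a * b);   // sum of a[i]*b[i] over all 16 elements
}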

// function add_saturated: add element by element, signed with saturation
// (is it faster to up-convert to 64 bit integers, and then downconvert the sum with saturation?)
static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
    __m512i sum    = _mm512_add_epi32(a, b);               // a + b
    __m512i axb    = _mm512_xor_epi32(a, b);               // check if a and b have different sign
    __m512i axs    = _mm512_xor_epi32(a, sum);             // check if a and sum have different sign
    __m512i ovf1   = _mm512_andnot_epi32(axb,axs);         // check if sum has wrong sign
    __m512i ovf2   = _mm512_srai_epi32(ovf1,31);           // -1 if overflow
    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
    __m512i asign  = _mm512_srli_epi32(a,31);              // 1 if a < 0
    __m512i sat1   = _mm512_srli_epi32(ovf2,1);            // 7FFFFFFF if overflow
    __m512i sat2   = _mm512_add_epi32(sat1,asign);         // 7FFFFFFF if positive overflow 80000000 if negative overflow
    return _mm512_mask_blend_epi32(ovf3, sum, sat2);       // sum if not overflow, else sat2
}
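
// Worked illustration of the overflow test above (values chosen for the
// example): a = 0x7FFFFFFF, b = 1 gives sum = 0x80000000, whose sign differs
// from both operands, so ovf2 = -1 and the result saturates to 0x7FFFFFFF.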

// function sub_saturated: subtract element by element, signed with saturation
static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
    __m512i diff   = _mm512_sub_epi32(a, b);               // a - b
    __m512i axb    = _mm512_xor_si512(a, b);               // check if a and b have different sign
    __m512i axs    = _mm512_xor_si512(a, diff);            // check if a and diff have different sign
    __m512i ovf1   = _mm512_and_si512(axb,axs);            // check if diff has wrong sign
    __m512i ovf2   = _mm512_srai_epi32(ovf1,31);           // -1 if overflow
    __mmask16 ovf3 = _mm512_cmpneq_epi32_mask(ovf2, _mm512_setzero_epi32()); // same, as mask
    __m512i asign  = _mm512_srli_epi32(a,31);              // 1 if a < 0
    __m512i sat1   = _mm512_srli_epi32(ovf2,1);            // 7FFFFFFF if overflow
    __m512i sat2   = _mm512_add_epi32(sat1,asign);         // 7FFFFFFF if positive overflow 80000000 if negative overflow
    return _mm512_mask_blend_epi32(ovf3, diff, sat2);      // diff if not overflow, else sat2
}

// function max: a > b ? a : b
static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
    return _mm512_max_epi32(a,b);
}

// function min: a < b ? a : b
static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
    return _mm512_min_epi32(a,b);
}

// function abs: a >= 0 ? a : -a
static inline Vec16i abs(Vec16i const & a) {
    return _mm512_abs_epi32(a);
}

// function abs_saturated: same as abs, saturate if overflow
static inline Vec16i abs_saturated(Vec16i const & a) {
    return _mm512_min_epu32(abs(a), Vec16i(0x7FFFFFFF));
}

// function rotate_left all elements
// Use negative count to rotate right
static inline Vec16i rotate_left(Vec16i const & a, int b) {
    return _mm512_rolv_epi32(a, Vec16i(b));
}
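
// Example (illustrative only, not part of the library interface):
// rotate_left(Vec16i(0x80000001), 1) gives 0x00000003 in every element, and
// rotate_left(a, -1) rotates right by one because the count is taken mod 32.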


/*****************************************************************************
*
*          Vector of 16 32-bit unsigned integers
*
*****************************************************************************/

class Vec16ui : public Vec16i {
public:
    // Default constructor:
    Vec16ui() {
    };
    // Constructor to broadcast the same value into all elements:
    Vec16ui(uint32_t i) {
        zmm = _mm512_set1_epi32(i);
    };
    // Constructor to build from all elements:
    Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
        uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
        zmm = _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
    };
    // Constructor to build from two Vec8ui:
    Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
        zmm = Vec16i(Vec8i(a0), Vec8i(a1));
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec16ui(__m512i const & x) {
        zmm = x;
    };
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec16ui & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    };
    // Member function to load from array (unaligned)
    Vec16ui & load(void const * p) {
        Vec16i::load(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec16ui & load_a(void const * p) {
        Vec16i::load_a(p);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec16ui const & insert(uint32_t index, uint32_t value) {
        Vec16i::insert(index, value);
        return *this;
    }
    // Member function extract a single element from vector
    uint32_t extract(uint32_t index) const {
        return Vec16i::extract(index);
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    uint32_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec8ui:
    Vec8ui get_low() const {
        return Vec8ui(Vec16i::get_low());
    }
    Vec8ui get_high() const {
        return Vec8ui(Vec16i::get_high());
    }
};

// Define operators for this class

// vector operator + : add
static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) + Vec16i(b));
}

// vector operator - : subtract
static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) - Vec16i(b));
}

// vector operator * : multiply
static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui (Vec16i(a) * Vec16i(b));
}

// vector operator / : divide
// See bottom of file

// vector operator >> : shift right logical all elements
static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
    return _mm512_srl_epi32(a, _mm_cvtsi32_si128(b));
}

// vector operator >> : shift right logical all elements
static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
    return a >> (uint32_t)b;
}

// vector operator >>= : shift right logical
static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
    a = a >> b;
    return a;
}

// vector operator >>= : shift right logical
static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
    a = a >> uint32_t(b);
    return a;
}

// vector operator << : shift left all elements
static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
    return Vec16ui ((Vec16i)a << (int32_t)b);
}

// vector operator << : shift left all elements
static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
    return Vec16ui ((Vec16i)a << (int32_t)b);
}

// vector operator < : returns true for elements for which a < b (unsigned)
static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
    return _mm512_cmplt_epu32_mask(a, b);
}

// vector operator > : returns true for elements for which a > b (unsigned)
static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (unsigned)
static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
    return _mm512_cmpge_epu32_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (unsigned)
static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) & Vec16i(b));
}

// vector operator | : bitwise or
static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) | Vec16i(b));
}

// vector operator ^ : bitwise xor
static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(Vec16i(a) ^ Vec16i(b));
}

// vector operator ~ : bitwise not
static inline Vec16ui operator ~ (Vec16ui const & a) {
    return Vec16ui( ~ Vec16i(a));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(select(s, Vec16i(a), Vec16i(b)));
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui(if_add(f, Vec16i(a), Vec16i(b)));
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline uint32_t horizontal_add (Vec16ui const & a) {
    return horizontal_add((Vec16i)a);
}

// horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file

// function add_saturated: add element by element, unsigned with saturation
static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
    Vec16ui sum      = a + b;
    Vec16ib overflow = sum < (a | b);                      // overflow if (a + b) < (a | b)
    return _mm512_mask_set1_epi32(sum, overflow, -1);      // 0xFFFFFFFF if overflow
}
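
// Reasoning note for the overflow test above: a + b == (a | b) + (a & b), so
// without wraparound the sum can never be below a | b; after wraparound the
// sum has lost 2^32 and must drop below a | b, which makes the test exact.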

// function sub_saturated: subtract element by element, unsigned with saturation
static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
    Vec16ui diff = a - b;
    return _mm512_maskz_mov_epi32(diff <= a, diff);        // underflow if diff > a gives zero
}

// function max: a > b ? a : b
static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
    return _mm512_max_epu32(a,b);
}

// function min: a < b ? a : b
static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
    return _mm512_min_epu32(a,b);
}


/*****************************************************************************
*
*          Vector of 8 64-bit signed integers
*
*****************************************************************************/

class Vec8q : public Vec512b {
public:
    // Default constructor:
    Vec8q() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec8q(int64_t i) {
        zmm = _mm512_set1_epi64(i);
    }
    // Constructor to build from all elements:
    Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
        zmm = _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
    }
    // Constructor to build from two Vec4q:
    Vec8q(Vec4q const & a0, Vec4q const & a1) {
        zmm = _mm512_inserti64x4(_mm512_castsi256_si512(a0), a1, 1);
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec8q(__m512i const & x) {
        zmm = x;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec8q & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Type cast operator to convert to __m512i used in intrinsics
    operator __m512i() const {
        return zmm;
    }
    // Member function to load from array (unaligned)
    Vec8q & load(void const * p) {
        zmm = _mm512_loadu_si512(p);
        return *this;
    }
    // Member function to load from array, aligned by 64
    Vec8q & load_a(void const * p) {
        zmm = _mm512_load_si512(p);
        return *this;
    }
    // Partial load. Load n elements and set the rest to 0
    Vec8q & load_partial(int n, void const * p) {
        zmm = _mm512_maskz_loadu_epi64(__mmask16((1 << n) - 1), p);
        return *this;
    }
    // Partial store. Store n elements
    void store_partial(int n, void * p) const {
        _mm512_mask_storeu_epi64(p, __mmask16((1 << n) - 1), zmm);
    }
    // cut off vector to n elements. The last 8-n elements are set to zero
    Vec8q & cutoff(int n) {
        zmm = _mm512_maskz_mov_epi64(__mmask16((1 << n) - 1), zmm);
        return *this;
    }
    // Member function to change a single element in vector
    // Note: This function is inefficient. Use load function if changing more than one element
    Vec8q const & insert(uint32_t index, int64_t value) {
        zmm = _mm512_mask_set1_epi64(zmm, __mmask16(1 << index), value);
        // zmm = _mm512_mask_blend_epi64(__mmask16(1 << index), zmm, _mm512_set1_epi64(value));
        return *this;
    }
    // Member function extract a single element from vector
    int64_t extract(uint32_t index) const {
        int64_t a[8];
        store (a);
        return a[index & 7];
    }
    // Extract a single element. Use store function if extracting more than one element.
    // Operator [] can only read an element, not write.
    int64_t operator [] (uint32_t index) const {
        return extract(index);
    }
    // Member functions to split into two Vec4q:
    Vec4q get_low() const {
        return _mm512_castsi512_si256(zmm);
    }
    Vec4q get_high() const {
        return _mm512_extracti64x4_epi64(zmm,1);
    }
    static int size () {
        return 8;
    }
};


// Define operators for Vec8q

// vector operator + : add element by element
static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
    return _mm512_add_epi64(a, b);
}

// vector operator += : add
static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
    a = a + b;
    return a;
}

// postfix operator ++
static inline Vec8q operator ++ (Vec8q & a, int) {
    Vec8q a0 = a;
    a = a + 1;
    return a0;
}

// prefix operator ++
static inline Vec8q & operator ++ (Vec8q & a) {
    a = a + 1;
    return a;
}

// vector operator - : subtract element by element
static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
    return _mm512_sub_epi64(a, b);
}

// vector operator - : unary minus
static inline Vec8q operator - (Vec8q const & a) {
    return _mm512_sub_epi64(_mm512_setzero_epi32(), a);
}

// vector operator -= : subtract
static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
    a = a - b;
    return a;
}

// postfix operator --
static inline Vec8q operator -- (Vec8q & a, int) {
    Vec8q a0 = a;
    a = a - 1;
    return a0;
}

// prefix operator --
static inline Vec8q & operator -- (Vec8q & a) {
    a = a - 1;
    return a;
}

// vector operator * : multiply element by element
static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
#ifdef __AVX512DQ__
    return _mm512_mullo_epi64(a, b);
#elif defined (__INTEL_COMPILER)
    return _mm512_mullox_epi64(a, b);                      // _mm512_mullox_epi64 missing in gcc
#else
    // instruction does not exist. Split into 32-bit multiplies
    //__m512i ahigh = _mm512_shuffle_epi32(a, 0xB1);       // swap H<->L
    __m512i ahigh   = _mm512_srli_epi64(a, 32);            // high 32 bits of each a
    __m512i bhigh   = _mm512_srli_epi64(b, 32);            // high 32 bits of each b
    __m512i prodahb = _mm512_mul_epu32(ahigh, b);          // ahigh*b
    __m512i prodbha = _mm512_mul_epu32(bhigh, a);          // bhigh*a
    __m512i prodhl  = _mm512_add_epi64(prodahb, prodbha);  // sum of high*low products
    __m512i prodhi  = _mm512_slli_epi64(prodhl, 32);       // same, shifted high
    __m512i prodll  = _mm512_mul_epu32(a, b);              // alow*blow = 64 bit unsigned products
    __m512i prod    = _mm512_add_epi64(prodll, prodhi);    // low*low+(high*low)<<32
    return prod;
#endif
}
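
// Reasoning note for the fallback above: writing a = ah*2^32 + al and
// b = bh*2^32 + bl gives a*b mod 2^64 = al*bl + ((ah*bl + bh*al) << 32);
// the ah*bh term falls entirely outside the low 64 bits, so three
// 32x32->64 bit multiplies are sufficient.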

// vector operator *= : multiply
static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
    a = a * b;
    return a;
}

// vector operator << : shift left
static inline Vec8q operator << (Vec8q const & a, int32_t b) {
    return _mm512_sll_epi64(a, _mm_cvtsi32_si128(b));
}

// vector operator <<= : shift left
static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
    a = a << b;
    return a;
}

// vector operator >> : shift right arithmetic
static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
    return _mm512_sra_epi64(a, _mm_cvtsi32_si128(b));
}

// vector operator >>= : shift right arithmetic
static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
    a = a >> b;
    return a;
}

// vector operator == : returns true for elements for which a == b
static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpeq_epi64_mask(a, b));
}

// vector operator != : returns true for elements for which a != b
static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpneq_epi64_mask(a, b));
}

// vector operator < : returns true for elements for which a < b
static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmplt_epi64_mask(a, b));
}

// vector operator > : returns true for elements for which a > b
static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (signed)
static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
    return Vec8qb(_mm512_cmpge_epi64_mask(a, b));
}

// vector operator <= : returns true for elements for which a <= b (signed)
static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
    return _mm512_and_epi32(a, b);
}

// vector operator &= : bitwise and
static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
    a = a & b;
    return a;
}

// vector operator | : bitwise or
static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
    return _mm512_or_epi32(a, b);
}

// vector operator |= : bitwise or
static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
    a = a | b;
    return a;
}

// vector operator ^ : bitwise xor
static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
    return _mm512_xor_epi32(a, b);
}
// vector operator ^= : bitwise xor
static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
    a = a ^ b;
    return a;
}

// vector operator ~ : bitwise not
static inline Vec8q operator ~ (Vec8q const & a) {
    return Vec8q(~ Vec16i(a));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
    return _mm512_mask_mov_epi64(b, s, a);
    // return _mm512_mask_blend_epi64(s, b, a);
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
    return _mm512_mask_add_epi64(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline int64_t horizontal_add (Vec8q const & a) {
#if defined(__INTEL_COMPILER)
    return _mm512_reduce_add_epi64(a);
#else
    return horizontal_add(a.get_low() + a.get_high());
#endif
}

// Horizontal add extended: Calculates the sum of all vector elements
// Elements are sign extended before adding to avoid overflow
static inline int64_t horizontal_add_x (Vec16i const & x) {
    Vec8q a = _mm512_cvtepi32_epi64(x.get_low());
    Vec8q b = _mm512_cvtepi32_epi64(x.get_high());
    return horizontal_add(a + b);
}

// Horizontal add extended: Calculates the sum of all vector elements
// Elements are zero extended before adding to avoid overflow
static inline uint64_t horizontal_add_x (Vec16ui const & x) {
    Vec8q a = _mm512_cvtepu32_epi64(x.get_low());
    Vec8q b = _mm512_cvtepu32_epi64(x.get_high());
    return horizontal_add(a + b);
}
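
// Example (illustrative only, not part of the library interface): summing a
// long array exactly in 64 bits, where plain horizontal_add could wrap at
// 32 bits. The function name is hypothetical; n16 counts whole 16-element
// blocks at p.
static inline int64_t example_exact_sum(int32_t const * p, int n16) {
    int64_t sum = 0;
    for (int i = 0; i < n16; i++) {
        sum += horizontal_add_x(Vec16i().load(p + 16*i));  // sign-extends before adding
    }
    return sum;
}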

// function max: a > b ? a : b
static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
    return _mm512_max_epi64(a, b);
}

// function min: a < b ? a : b
static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
    return _mm512_min_epi64(a, b);
}

// function abs: a >= 0 ? a : -a
static inline Vec8q abs(Vec8q const & a) {
    return _mm512_abs_epi64(a);
}

// function abs_saturated: same as abs, saturate if overflow
static inline Vec8q abs_saturated(Vec8q const & a) {
    return _mm512_min_epu64(abs(a), Vec8q(0x7FFFFFFFFFFFFFFF));
}

// function rotate_left all elements
// Use negative count to rotate right
static inline Vec8q rotate_left(Vec8q const & a, int b) {
    return _mm512_rolv_epi64(a, Vec8q(b));
}


/*****************************************************************************
*
*          Vector of 8 64-bit unsigned integers
*
*****************************************************************************/

class Vec8uq : public Vec8q {
public:
    // Default constructor:
    Vec8uq() {
    }
    // Constructor to broadcast the same value into all elements:
    Vec8uq(uint64_t i) {
        zmm = Vec8q(i);
    }
    // Constructor to convert from Vec8q:
    Vec8uq(Vec8q const & x) {
        zmm = x;
    }
    // Constructor to convert from type __m512i used in intrinsics:
    Vec8uq(__m512i const & x) {
        zmm = x;
    }
    // Constructor to build from all elements:
    Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
        zmm = Vec8q(i0, i1, i2, i3, i4, i5, i6, i7);
    }
    // Constructor to build from two Vec4uq:
    Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
        zmm = Vec8q(Vec4q(a0), Vec4q(a1));
    }
    // Assignment operator to convert from Vec8q:
    Vec8uq & operator = (Vec8q const & x) {
        zmm = x;
        return *this;
    }
    // Assignment operator to convert from type __m512i used in intrinsics:
    Vec8uq & operator = (__m512i const & x) {
        zmm = x;
        return *this;
    }
    // Member function to load from array (unaligned)
    Vec8uq & load(void const * p) {
        Vec8q::load(p);
        return *this;
    }
1618 // Member function to load from array, aligned by 32
load_a(void const * p)1619 Vec8uq & load_a(void const * p) {
1620 Vec8q::load_a(p);
1621 return *this;
1622 }
1623 // Member function to change a single element in vector
1624 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,uint64_t value)1625 Vec8uq const & insert(uint32_t index, uint64_t value) {
1626 Vec8q::insert(index, value);
1627 return *this;
1628 }
1629 // Member function extract a single element from vector
extract(uint32_t index)1630 uint64_t extract(uint32_t index) const {
1631 return Vec8q::extract(index);
1632 }
1633 // Extract a single element. Use store function if extracting more than one element.
1634 // Operator [] can only read an element, not write.
1635 uint64_t operator [] (uint32_t index) const {
1636 return extract(index);
1637 }
1638 // Member functions to split into two Vec2uq:
get_low()1639 Vec4uq get_low() const {
1640 return Vec4uq(Vec8q::get_low());
1641 }
get_high()1642 Vec4uq get_high() const {
1643 return Vec4uq(Vec8q::get_high());
1644 }
1645 };

// Define operators for this class

// vector operator + : add
static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) + Vec8q(b));
}

// vector operator - : subtract
static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) - Vec8q(b));
}

// vector operator * : multiply element by element
static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq (Vec8q(a) * Vec8q(b));
}

// vector operator >> : shift right logical all elements
static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
    return _mm512_srl_epi64(a,_mm_cvtsi32_si128(b));
}

// vector operator >> : shift right logical all elements
static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
    return a >> (uint32_t)b;
}

// vector operator >>= : shift right logical
static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
    a = a >> b;
    return a;
}

// vector operator >>= : shift right logical
static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
    a = a >> uint32_t(b);
    return a;
}

// vector operator << : shift left all elements
static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
    return Vec8uq ((Vec8q)a << (int32_t)b);
}

// vector operator << : shift left all elements
static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
    return Vec8uq ((Vec8q)a << b);
}

// vector operator < : returns true for elements for which a < b (unsigned)
static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
    return _mm512_cmplt_epu64_mask(a, b);
}

// vector operator > : returns true for elements for which a > b (unsigned)
static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
    return b < a;
}

// vector operator >= : returns true for elements for which a >= b (unsigned)
static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
    return _mm512_cmpge_epu64_mask(a, b);
}

// vector operator <= : returns true for elements for which a <= b (unsigned)
static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
    return b >= a;
}

// vector operator & : bitwise and
static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) & Vec8q(b));
}

// vector operator | : bitwise or
static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) | Vec8q(b));
}

// vector operator ^ : bitwise xor
static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(Vec8q(a) ^ Vec8q(b));
}

// Functions for this class

// Select between two operands. Corresponds to this pseudocode:
// for (int i = 0; i < 8; i++) result[i] = s[i] ? a[i] : b[i];
static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq(select(s, Vec8q(a), Vec8q(b)));
}

// Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
    return _mm512_mask_add_epi64(a, f, a, b);
}

// Horizontal add: Calculates the sum of all vector elements.
// Overflow will wrap around
static inline uint64_t horizontal_add (Vec8uq const & a) {
    return horizontal_add(Vec8q(a));
}

// function max: a > b ? a : b
static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
    return _mm512_max_epu64(a, b);
}

// function min: a < b ? a : b
static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
    return _mm512_min_epu64(a, b);
}


/*****************************************************************************
*
* Vector permute functions
*
******************************************************************************
*
* These permute functions can reorder the elements of a vector and optionally
* set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to select.
* An index of -1 will generate zero. An index of -256 means don't care.
*
* Example:
* Vec8q a(10,11,12,13,14,15,16,17);      // a is (10,11,12,13,14,15,16,17)
* Vec8q b;
* b = permute8q<0,2,7,7,-1,-1,1,1>(a);   // b is (10,12,17,17, 0, 0,11,11)
*
* A lot of the code here is metaprogramming aiming to find the instructions
* that best fit the template parameters and instruction set. The metacode
* will be reduced out to leave only a few vector instructions in release
* mode with optimization on.
*****************************************************************************/

// Permute vector of 8 64-bit integers.
// Index -1 gives 0, index -256 means don't care.
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q permute8q(Vec8q const & a) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const int m1 = (i0&7) | (i1&7)<<4 | (i2&7)<< 8 | (i3&7)<<12 | (i4&7)<<16 | (i5&7)<<20 | (i6&7)<<24 | (i7&7)<<28;

    // Mask to zero out negative indexes
    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
    const int m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;

    // special case: all zero
    if (mz == 0) return _mm512_setzero_epi32();

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);
    // same with 2 bits for each element
    const __mmask16 zz = __mmask16((i0>=0?3:0) | (i1>=0?0xC:0) | (i2>=0?0x30:0) | (i3>=0?0xC0:0) | (i4>=0?0x300:0) | (i5>=0?0xC00:0) | (i6>=0?0x3000:0) | (i7>=0?0xC000:0));

    if (((m1 ^ 0x76543210) & mz) == 0) {
        // no shuffling
        if (dozero) {
            // zero some elements
            return _mm512_maskz_mov_epi64(z, a);
        }
        return a;  // do nothing
    }

    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
        // no exchange of data between the four 128-bit lanes
        const int pat = ((m2 | m2 >> 8 | m2 >> 16 | m2 >> 24) & 0x11) * 0x01010101;
        const int pmask = ((pat & 1) * 10 + 4) | ((((pat >> 4) & 1) * 10 + 4) << 4);
        if (((m1 ^ pat) & mz & 0x11111111) == 0) {
            // same permute pattern in all lanes
            if (dozero) {  // permute within lanes and zero
                return _mm512_maskz_shuffle_epi32(zz, a, (_MM_PERM_ENUM)pmask);
            }
            else {  // permute within lanes
                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
            }
        }
        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
    }
    if ((((m1 ^ 0x10101010) & 0x11111111 & mz) == 0)
    &&  ((m1 ^ (m1 >> 4)) & 0x06060606 & mz & (mz >> 4)) == 0) {
        // permute lanes only. no permutation within each lane
        const int m3 = m2 | (m2 >> 4);
        const int s = ((m3 >> 1) & 3) | (((m3 >> 9) & 3) << 2) | (((m3 >> 17) & 3) << 4) | (((m3 >> 25) & 3) << 6);
        if (dozero) {
            // permute lanes and zero some 64-bit elements
            return _mm512_maskz_shuffle_i64x2(z, a, a, (_MM_PERM_ENUM)s);
        }
        else {
            // permute lanes
            return _mm512_shuffle_i64x2(a, a, (_MM_PERM_ENUM)s);
        }
    }
    // full permute needed
    const __m512i pmask = constant16i<i0&7, 0, i1&7, 0, i2&7, 0, i3&7, 0, i4&7, 0, i5&7, 0, i6&7, 0, i7&7, 0>();
    if (dozero) {
        // full permute and zeroing
        // Documentation is inconsistent. Which order of the operands is correct?
        return _mm512_maskz_permutexvar_epi64(z, pmask, a);
    }
    else {
        return _mm512_permutexvar_epi64(pmask, a);
    }
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8uq permute8uq(Vec8uq const & a) {
    return Vec8uq (permute8q<i0,i1,i2,i3,i4,i5,i6,i7> (a));
}


// Permute vector of 16 32-bit integers.
// Index -1 gives 0, index -256 means don't care.
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16i permute16i(Vec16i const & a) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const uint64_t m1 = (i0&15) | (i1&15)<<4 | (i2&15)<< 8 | (i3&15)<<12 | (i4&15)<<16 | (i5&15)<<20 | (i6&15)<<24 | (i7&15LL)<<28  // 15LL avoids sign extension of (int32_t | int64_t)
        | (i8&15LL)<<32 | (i9&15LL)<<36 | (i10&15LL)<<40 | (i11&15LL)<<44 | (i12&15LL)<<48 | (i13&15LL)<<52 | (i14&15LL)<<56 | (i15&15LL)<<60;

    // Mask to zero out negative indexes
    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL) | (i8<0?0:0xF00000000)
        | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);

    const uint64_t m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;

    // special case: all zero
    if (mz == 0) return _mm512_setzero_epi32();

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);

    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
        // no shuffling
        if (dozero) {
            // zero some elements
            return _mm512_maskz_mov_epi32(z, a);
        }
        return a;  // do nothing
    }

    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
        // no exchange of data between the four 128-bit lanes
        const uint64_t pat = ((m2 | (m2 >> 16) | (m2 >> 32) | (m2 >> 48)) & 0x3333) * 0x0001000100010001;
        const int pmask = (pat & 3) | (((pat >> 4) & 3) << 2) | (((pat >> 8) & 3) << 4) | (((pat >> 12) & 3) << 6);
        if (((m1 ^ pat) & 0x3333333333333333 & mz) == 0) {
            // same permute pattern in all lanes
            if (dozero) {  // permute within lanes and zero
                return _mm512_maskz_shuffle_epi32(z, a, (_MM_PERM_ENUM)pmask);
            }
            else {  // permute within lanes
                return _mm512_shuffle_epi32(a, (_MM_PERM_ENUM)pmask);
            }
        }
        // different permute patterns in each lane. It's faster to do a full permute than four masked permutes within lanes
    }
    const uint64_t lane = (m2 | m2 >> 4 | m2 >> 8 | m2 >> 12) & 0x000C000C000C000C;
    if ((((m1 ^ 0x3210321032103210) & 0x3333333333333333 & mz) == 0)
    &&  ((m1 ^ (lane * 0x1111)) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {
        // permute lanes only. no permutation within each lane
        const uint64_t s = ((lane >> 2) & 3) | (((lane >> 18) & 3) << 2) | (((lane >> 34) & 3) << 4) | (((lane >> 50) & 3) << 6);
        if (dozero) {
            // permute lanes and zero some 32-bit elements
            return _mm512_maskz_shuffle_i32x4(z, a, a, (_MM_PERM_ENUM)s);
        }
        else {
            // permute lanes
            return _mm512_shuffle_i32x4(a, a, (_MM_PERM_ENUM)s);
        }
    }
    // full permute needed
    const __m512i pmask = constant16i<i0&15, i1&15, i2&15, i3&15, i4&15, i5&15, i6&15, i7&15, i8&15, i9&15, i10&15, i11&15, i12&15, i13&15, i14&15, i15&15>();
    if (dozero) {
        // full permute and zeroing
        return _mm512_maskz_permutexvar_epi32(z, pmask, a);
    }
    else {
        return _mm512_permutexvar_epi32(pmask, a);
    }
}


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16ui permute16ui(Vec16ui const & a) {
    return Vec16ui (permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
}


/*****************************************************************************
*
* Vector blend functions
*
******************************************************************************
*
* These blend functions can mix elements from two different vectors and
* optionally set some elements to zero.
*
* The indexes are inserted as template parameters in <>. These indexes must be
* constants. Each template parameter is an index to the element you want to
* select, where higher indexes indicate an element from the second source
* vector. For example, if each vector has 8 elements, then indexes 0 - 7
* will select an element from the first vector and indexes 8 - 15 will select
* an element from the second vector. A negative index will generate zero.
*
* Example:
* Vec8q a(100,101,102,103,104,105,106,107);  // a is (100, 101, 102, 103, 104, 105, 106, 107)
* Vec8q b(200,201,202,203,204,205,206,207);  // b is (200, 201, 202, 203, 204, 205, 206, 207)
* Vec8q c;
* c = blend8q<1,0,9,8,7,-1,15,15> (a,b);     // c is (101, 100, 201, 200, 107,   0, 207, 207)
*
* A lot of the code here is metaprogramming aiming to find the instructions
* that best fit the template parameters and instruction set. The metacode
* will be reduced out to leave only a few vector instructions in release
* mode with optimization on.
*****************************************************************************/


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {

    // Combine indexes into a single bitfield, with 4 bits for each
    const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<< 8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;

    // Mask to zero out negative indexes
    const int mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000);
    const int m2 = m1 & mz;

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7) & 0x80) != 0;

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7);

    // special case: all zero
    if (mz == 0) return _mm512_setzero_epi32();

    // special case: all from a
    if ((m1 & 0x88888888 & mz) == 0) {
        return permute8q <i0, i1, i2, i3, i4, i5, i6, i7> (a);
    }

    // special case: all from b
    if ((~m1 & 0x88888888 & mz) == 0) {
        return permute8q <i0^8, i1^8, i2^8, i3^8, i4^8, i5^8, i6^8, i7^8> (b);
    }

    // special case: blend without permute
    if (((m1 ^ 0x76543210) & 0x77777777 & mz) == 0) {
        __mmask16 blendmask = __mmask16((i0&8)>>3 | (i1&8)>>2 | (i2&8)>>1 | (i3&8)>>0 | (i4&8)<<1 | (i5&8)<<2 | (i6&8)<<3 | (i7&8)<<4 );
        __m512i t = _mm512_mask_blend_epi64(blendmask, a, b);
        if (dozero) {
            t = _mm512_maskz_mov_epi64(z, t);
        }
        return t;
    }
    // special case: all data stay within their lane
    if (((m1 ^ 0x66442200) & 0x66666666 & mz) == 0) {
        // mask for elements from a and b
        const uint32_t mb = ((i0&8)?0xF:0) | ((i1&8)?0xF0:0) | ((i2&8)?0xF00:0) | ((i3&8)?0xF000:0) | ((i4&8)?0xF0000:0) | ((i5&8)?0xF00000:0) | ((i6&8)?0xF000000:0) | ((i7&8)?0xF0000000:0);
        const uint32_t mbz = mb & mz;   // mask for nonzero elements from b
        const uint32_t maz = ~mb & mz;  // mask for nonzero elements from a
        const uint32_t m1a = m1 & maz;
        const uint32_t m1b = m1 & mbz;
        const uint32_t pata = ((m1a | m1a >> 8 | m1a >> 16 | m1a >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from a
        const uint32_t patb = ((m1b | m1b >> 8 | m1b >> 16 | m1b >> 24) & 0xFF) * 0x01010101;  // permute pattern for elements from b
        if (((m1 ^ pata) & 0x11111111 & maz) == 0 && ((m1 ^ patb) & 0x11111111 & mbz) == 0) {
            // Same permute pattern in all lanes:
            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
            // and we are saving 64 bytes of data cache.
            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
            __m512i ta = permute8q< (maz&0xF)?i0&7:-1, (maz&0xF0)?i1&7:-1, (maz&0xF00)?i2&7:-1, (maz&0xF000)?i3&7:-1,
                (maz&0xF0000)?i4&7:-1, (maz&0xF00000)?i5&7:-1, (maz&0xF000000)?i6&7:-1, (maz&0xF0000000)?i7&7:-1> (a);
            // write mask for elements from b
            const __mmask16 sb = ((mbz&0xF)?3:0) | ((mbz&0xF0)?0xC:0) | ((mbz&0xF00)?0x30:0) | ((mbz&0xF000)?0xC0:0) | ((mbz&0xF0000)?0x300:0) | ((mbz&0xF00000)?0xC00:0) | ((mbz&0xF000000)?0x3000:0) | ((mbz&0xF0000000)?0xC000:0);
            // permute index for elements from b
            const int pi = ((patb & 1) * 10 + 4) | ((((patb >> 4) & 1) * 10 + 4) << 4);
            // 2. Permute elements from b and combine with elements from a through write mask
            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
        }
        // not same permute pattern in all lanes. use full permute
    }
    // general case: full permute
    const __m512i pmask = constant16i<i0&0xF, 0, i1&0xF, 0, i2&0xF, 0, i3&0xF, 0, i4&0xF, 0, i5&0xF, 0, i6&0xF, 0, i7&0xF, 0>();
    if (dozero) {
        return _mm512_maskz_permutex2var_epi64(z, a, pmask, b);
    }
    else {
        return _mm512_permutex2var_epi64(a, pmask, b);
    }
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
    return Vec8uq( blend8q<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));
}


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {

    // Combine indexes into a single bitfield, with 4 bits for each indicating shuffle, but not source
    const uint64_t m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xFLL)<<28
        | (i8&0xFLL)<<32 | (i9&0xFLL)<<36 | (i10&0xFLL)<<40 | (i11&0xFLL)<<44 | (i12&0xFLL)<<48 | (i13&0xFLL)<<52 | (i14&0xFLL)<<56 | (i15&0xFLL)<<60;

    // Mask to zero out negative indexes
    const uint64_t mz = (i0<0?0:0xF) | (i1<0?0:0xF0) | (i2<0?0:0xF00) | (i3<0?0:0xF000) | (i4<0?0:0xF0000) | (i5<0?0:0xF00000) | (i6<0?0:0xF000000) | (i7<0?0:0xF0000000ULL)
        | (i8<0?0:0xF00000000) | (i9<0?0:0xF000000000) | (i10<0?0:0xF0000000000) | (i11<0?0:0xF00000000000) | (i12<0?0:0xF000000000000) | (i13<0?0:0xF0000000000000) | (i14<0?0:0xF00000000000000) | (i15<0?0:0xF000000000000000);
    const uint64_t m2 = m1 & mz;

    // collect bit 4 of each index = select source
    const uint64_t ms = ((i0&16)?0xF:0) | ((i1&16)?0xF0:0) | ((i2&16)?0xF00:0) | ((i3&16)?0xF000:0) | ((i4&16)?0xF0000:0) | ((i5&16)?0xF00000:0) | ((i6&16)?0xF000000:0) | ((i7&16)?0xF0000000ULL:0)
        | ((i8&16)?0xF00000000:0) | ((i9&16)?0xF000000000:0) | ((i10&16)?0xF0000000000:0) | ((i11&16)?0xF00000000000:0) | ((i12&16)?0xF000000000000:0) | ((i13&16)?0xF0000000000000:0) | ((i14&16)?0xF00000000000000:0) | ((i15&16)?0xF000000000000000:0);

    // zeroing needed
    const bool dozero = ((i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15) & 0x80) != 0;

    // mask for elements not zeroed
    const __mmask16 z = __mmask16((i0>=0)<<0 | (i1>=0)<<1 | (i2>=0)<<2 | (i3>=0)<<3 | (i4>=0)<<4 | (i5>=0)<<5 | (i6>=0)<<6 | (i7>=0)<<7
        | (i8>=0)<<8 | (i9>=0)<<9 | (i10>=0)<<10 | (i11>=0)<<11 | (i12>=0)<<12 | (i13>=0)<<13 | (i14>=0)<<14 | (i15>=0)<<15);

    // special case: all zero
    if (mz == 0) return _mm512_setzero_epi32();

    // special case: all from a
    if ((ms & mz) == 0) {
        return permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a);
    }

    // special case: all from b
    if ((~ms & mz) == 0) {
        return permute16i<i0^16,i1^16,i2^16,i3^16,i4^16,i5^16,i6^16,i7^16,i8^16,i9^16,i10^16,i11^16,i12^16,i13^16,i14^16,i15^16 > (b);
    }

    // special case: blend without permute
    if (((m1 ^ 0xFEDCBA9876543210) & mz) == 0) {
        __mmask16 blendmask = __mmask16((i0&16)>>4 | (i1&16)>>3 | (i2&16)>>2 | (i3&16)>>1 | (i4&16) | (i5&16)<<1 | (i6&16)<<2 | (i7&16)<<3
            | (i8&16)<<4 | (i9&16)<<5 | (i10&16)<<6 | (i11&16)<<7 | (i12&16)<<8 | (i13&16)<<9 | (i14&16)<<10 | (i15&16)<<11);
        __m512i t = _mm512_mask_blend_epi32(blendmask, a, b);
        if (dozero) {
            t = _mm512_maskz_mov_epi32(z, t);
        }
        return t;
    }

    // special case: all data stay within their lane
    if (((m1 ^ 0xCCCC888844440000) & 0xCCCCCCCCCCCCCCCC & mz) == 0) {

        // mask for elements from a and b
        const uint64_t mb  = ms;
        const uint64_t mbz = mb & mz;   // mask for nonzero elements from b
        const uint64_t maz = ~mb & mz;  // mask for nonzero elements from a
        const uint64_t m1a = m1 & maz;
        const uint64_t m1b = m1 & mbz;
        const uint64_t pata = ((m1a | m1a >> 16 | m1a >> 32 | m1a >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from a
        const uint64_t patb = ((m1b | m1b >> 16 | m1b >> 32 | m1b >> 48) & 0xFFFF) * 0x0001000100010001;  // permute pattern for elements from b
        if (((m1 ^ pata) & 0x3333333333333333 & maz) == 0 && ((m1 ^ patb) & 0x3333333333333333 & mbz) == 0) {
            // Same permute pattern in all lanes:
            // This code generates two instructions instead of one, but we are avoiding the slow lane-crossing instruction,
            // and we are saving 64 bytes of data cache.
            // 1. Permute a, zero elements not from a (using _mm512_maskz_shuffle_epi32)
            __m512i ta = permute16i< (maz&0xF)?i0&15:-1, (maz&0xF0)?i1&15:-1, (maz&0xF00)?i2&15:-1, (maz&0xF000)?i3&15:-1,
                (maz&0xF0000)?i4&15:-1, (maz&0xF00000)?i5&15:-1, (maz&0xF000000)?i6&15:-1, (maz&0xF0000000)?i7&15:-1,
                (maz&0xF00000000)?i8&15:-1, (maz&0xF000000000)?i9&15:-1, (maz&0xF0000000000)?i10&15:-1, (maz&0xF00000000000)?i11&15:-1,
                (maz&0xF000000000000)?i12&15:-1, (maz&0xF0000000000000)?i13&15:-1, (maz&0xF00000000000000)?i14&15:-1, (maz&0xF000000000000000)?i15&15:-1> (a);
            // write mask for elements from b
            const __mmask16 sb = ((mbz&0xF)?1:0) | ((mbz&0xF0)?0x2:0) | ((mbz&0xF00)?0x4:0) | ((mbz&0xF000)?0x8:0) | ((mbz&0xF0000)?0x10:0) | ((mbz&0xF00000)?0x20:0) | ((mbz&0xF000000)?0x40:0) | ((mbz&0xF0000000)?0x80:0)
                | ((mbz&0xF00000000)?0x100:0) | ((mbz&0xF000000000)?0x200:0) | ((mbz&0xF0000000000)?0x400:0) | ((mbz&0xF00000000000)?0x800:0) | ((mbz&0xF000000000000)?0x1000:0) | ((mbz&0xF0000000000000)?0x2000:0) | ((mbz&0xF00000000000000)?0x4000:0) | ((mbz&0xF000000000000000)?0x8000:0);
            // permute index for elements from b
            const int pi = (patb & 3) | (((patb >> 4) & 3) << 2) | (((patb >> 8) & 3) << 4) | (((patb >> 12) & 3) << 6);
            // 2. Permute elements from b and combine with elements from a through write mask
            return _mm512_mask_shuffle_epi32(ta, sb, b, (_MM_PERM_ENUM)pi);
        }
        // not same permute pattern in all lanes. use full permute
    }

    // general case: full permute
    const __m512i pmask = constant16i<i0&0x1F, i1&0x1F, i2&0x1F, i3&0x1F, i4&0x1F, i5&0x1F, i6&0x1F, i7&0x1F,
        i8&0x1F, i9&0x1F, i10&0x1F, i11&0x1F, i12&0x1F, i13&0x1F, i14&0x1F, i15&0x1F>();
    if (dozero) {
        return _mm512_maskz_permutex2var_epi32(z, a, pmask, b);
    }
    else {
        return _mm512_permutex2var_epi32(a, pmask, b);
    }
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
    return Vec16ui( blend16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (Vec16i(a),Vec16i(b)));
}


/*****************************************************************************
*
* Vector lookup functions
*
******************************************************************************
*
* These functions use vector elements as indexes into a table.
* The table is given as one or more vectors or as an array.
*
* This can be used for several purposes:
*  - table lookup
*  - permute or blend with variable indexes
*  - blend from more than two sources
*  - gather non-contiguous data
*
* An index out of range may produce any value - the actual value produced is
* implementation dependent and may be different for different instruction
* sets. An index out of range does not produce an error message or exception.
*
* Example:
* Vec8q a(2,0,0,6,4,3,5,0);                 // index a is (  2,   0,   0,   6,   4,   3,   5,   0)
* Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
* Vec8q c;
* c = lookup8 (a,b);                        // c is       (102, 100, 100, 106, 104, 103, 105, 100)
*
*****************************************************************************/

static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
    return _mm512_permutexvar_epi32(index, table);
}

template <int n>
static inline Vec16i lookup(Vec16i const & index, void const * table) {
    if (n <=  0) return 0;
    if (n <= 16) {
        Vec16i table1 = Vec16i().load(table);
        return lookup16(index, table1);
    }
    if (n <= 32) {
        Vec16i table1 = Vec16i().load(table);
        Vec16i table2 = Vec16i().load((int8_t*)table + 64);
        return _mm512_permutex2var_epi32(table1, index, table2);
    }
    // n > 32. Limit index
    Vec16ui index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec16ui(index) & (n-1);
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec16ui(index), uint32_t(n-1));
    }
    return _mm512_i32gather_epi32(index1, (const int*)table, 4);
    // return _mm512_i32gather_epi32(index1, table, _MM_UPCONV_EPI32_NONE, 4, 0);
}
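
// Example (illustrative sketch, not part of the library):
// static int32_t table[64];                     // lookup table, filled elsewhere
// Vec16i idx(3,1,4,1,5,9,2,6,5,3,5,8,9,7,9,3);  // indexes into the table
// Vec16i v = lookup<64>(idx, table);            // v[i] = table[idx[i]]
// (64 is a power of 2, so indexes would be wrapped modulo 64 before the gather)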


static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
    return _mm512_permutexvar_epi64(index, table);
}

template <int n>
static inline Vec8q lookup(Vec8q const & index, void const * table) {
    if (n <= 0) return 0;
    if (n <= 8) {
        Vec8q table1 = Vec8q().load(table);
        return lookup8(index, table1);
    }
    if (n <= 16) {
        Vec8q table1 = Vec8q().load(table);
        Vec8q table2 = Vec8q().load((int8_t*)table + 64);
        return _mm512_permutex2var_epi64(table1, index, table2);
    }
    // n > 16. Limit index
    Vec8uq index1;
    if ((n & (n-1)) == 0) {
        // n is a power of 2, make index modulo n
        index1 = Vec8uq(index) & (n-1);
    }
    else {
        // n is not a power of 2, limit to n-1
        index1 = min(Vec8uq(index), uint32_t(n-1));
    }
    return _mm512_i64gather_epi64(index1, (const long long*)table, 8);
}


/*****************************************************************************
*
* Gather functions with fixed indexes
*
*****************************************************************************/
// Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline Vec16i gather16i(void const * a) {
    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index;  // Error message if index is negative
    // find smallest and biggest index, using only compile-time constant expressions
    const int i01min    = i0  < i1  ? i0  : i1;
    const int i23min    = i2  < i3  ? i2  : i3;
    const int i45min    = i4  < i5  ? i4  : i5;
    const int i67min    = i6  < i7  ? i6  : i7;
    const int i89min    = i8  < i9  ? i8  : i9;
    const int i1011min  = i10 < i11 ? i10 : i11;
    const int i1213min  = i12 < i13 ? i12 : i13;
    const int i1415min  = i14 < i15 ? i14 : i15;
    const int i0_3min   = i01min   < i23min    ? i01min   : i23min;
    const int i4_7min   = i45min   < i67min    ? i45min   : i67min;
    const int i8_11min  = i89min   < i1011min  ? i89min   : i1011min;
    const int i12_15min = i1213min < i1415min  ? i1213min : i1415min;
    const int i0_7min   = i0_3min  < i4_7min   ? i0_3min  : i4_7min;
    const int i8_15min  = i8_11min < i12_15min ? i8_11min : i12_15min;
    const int imin      = i0_7min  < i8_15min  ? i0_7min  : i8_15min;
    const int i01max    = i0  > i1  ? i0  : i1;
    const int i23max    = i2  > i3  ? i2  : i3;
    const int i45max    = i4  > i5  ? i4  : i5;
    const int i67max    = i6  > i7  ? i6  : i7;
    const int i89max    = i8  > i9  ? i8  : i9;
    const int i1011max  = i10 > i11 ? i10 : i11;
    const int i1213max  = i12 > i13 ? i12 : i13;
    const int i1415max  = i14 > i15 ? i14 : i15;
    const int i0_3max   = i01max   > i23max    ? i01max   : i23max;
    const int i4_7max   = i45max   > i67max    ? i45max   : i67max;
    const int i8_11max  = i89max   > i1011max  ? i89max   : i1011max;
    const int i12_15max = i1213max > i1415max  ? i1213max : i1415max;
    const int i0_7max   = i0_3max  > i4_7max   ? i0_3max  : i4_7max;
    const int i8_15max  = i8_11max > i12_15max ? i8_11max : i12_15max;
    const int imax      = i0_7max  > i8_15max  ? i0_7max  : i8_15max;
    if (imax - imin <= 15) {
        // load one contiguous block and permute
        if (imax > 15) {
            // make sure we don't read past the end of the array
            Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
            return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
                i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
        }
        else {
            Vec16i b = Vec16i().load((int32_t const *)a + imin);
            return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
                i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
        }
    }
    if ((i0<imin+16  || i0>imax-16)  && (i1<imin+16  || i1>imax-16)  && (i2<imin+16  || i2>imax-16)  && (i3<imin+16  || i3>imax-16)
    &&  (i4<imin+16  || i4>imax-16)  && (i5<imin+16  || i5>imax-16)  && (i6<imin+16  || i6>imax-16)  && (i7<imin+16  || i7>imax-16)
    &&  (i8<imin+16  || i8>imax-16)  && (i9<imin+16  || i9>imax-16)  && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
    &&  (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
        // load two contiguous blocks and blend
        Vec16i b = Vec16i().load((int32_t const *)a + imin);
        Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
        const int j0  = i0 <imin+16 ? i0 -imin : 31-imax+i0;
        const int j1  = i1 <imin+16 ? i1 -imin : 31-imax+i1;
        const int j2  = i2 <imin+16 ? i2 -imin : 31-imax+i2;
        const int j3  = i3 <imin+16 ? i3 -imin : 31-imax+i3;
        const int j4  = i4 <imin+16 ? i4 -imin : 31-imax+i4;
        const int j5  = i5 <imin+16 ? i5 -imin : 31-imax+i5;
        const int j6  = i6 <imin+16 ? i6 -imin : 31-imax+i6;
        const int j7  = i7 <imin+16 ? i7 -imin : 31-imax+i7;
        const int j8  = i8 <imin+16 ? i8 -imin : 31-imax+i8;
        const int j9  = i9 <imin+16 ? i9 -imin : 31-imax+i9;
        const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
        const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
        const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
        const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
        const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
        const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
        return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
    }
    // use gather instruction
    return _mm512_i32gather_epi32(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), (const int *)a, 4);
}


template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline Vec8q gather8q(void const * a) {
    Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index;  // Error message if index is negative

    const int i01min   = i0 < i1 ? i0 : i1;
    const int i23min   = i2 < i3 ? i2 : i3;
    const int i45min   = i4 < i5 ? i4 : i5;
    const int i67min   = i6 < i7 ? i6 : i7;
    const int i0123min = i01min < i23min ? i01min : i23min;
    const int i4567min = i45min < i67min ? i45min : i67min;
    const int imin     = i0123min < i4567min ? i0123min : i4567min;
    const int i01max   = i0 > i1 ? i0 : i1;
    const int i23max   = i2 > i3 ? i2 : i3;
    const int i45max   = i4 > i5 ? i4 : i5;
    const int i67max   = i6 > i7 ? i6 : i7;
    const int i0123max = i01max > i23max ? i01max : i23max;
    const int i4567max = i45max > i67max ? i45max : i67max;
    const int imax     = i0123max > i4567max ? i0123max : i4567max;
    if (imax - imin <= 7) {
        // load one contiguous block and permute
        if (imax > 7) {
            // make sure we don't read past the end of the array
            Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
            return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
        }
        else {
            Vec8q b = Vec8q().load((int64_t const *)a + imin);
            return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
        }
    }
    if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
    &&  (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
        // load two contiguous blocks and blend
        Vec8q b = Vec8q().load((int64_t const *)a + imin);
        Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
        const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
        const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
        const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
        const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
        const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
        const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
        const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
        const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
        return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
    }
    // use gather instruction
    return _mm512_i64gather_epi64(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), (const long long *)a, 8);
}
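
// Example (illustrative, not part of the library):
// int64_t arr[16];                              // some array, filled elsewhere
// Vec8q v = gather8q<0,2,4,6,8,10,12,14>(arr);  // v[i] = arr[2*i]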

/*****************************************************************************
*
* Vector scatter functions
*
******************************************************************************
*
* These functions write the elements of a vector to arbitrary positions in an
* array in memory. Each vector element is written to an array position
* determined by an index. An element is not written if the corresponding
* index is out of range.
* The indexes can be specified as constant template parameters or as an
* integer vector.
*
* The scatter functions are useful if the data are distributed in a sparse
* manner into the array. If the array is dense then it is more efficient
* to permute the data into the right positions and then write the whole
* permuted vector into the array.
*
* Example:
* Vec8q a(10,11,12,13,14,15,16,17);
* int64_t b[16] = {0};
* scatter<0,2,14,10,1,-1,5,9>(a,b);
* // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
*
*****************************************************************************/

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
          int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
static inline void scatter(Vec16i const & data, void * array) {
    __m512i indx = constant16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15>();
    Vec16ib mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0,
        i8>=0, i9>=0, i10>=0, i11>=0, i12>=0, i13>=0, i14>=0, i15>=0);
    _mm512_mask_i32scatter_epi32((int*)array, mask, indx, data, 4);
}

template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
static inline void scatter(Vec8q const & data, void * array) {
    __m256i indx = constant8i<i0,i1,i2,i3,i4,i5,i6,i7>();
    Vec8qb mask(i0>=0, i1>=0, i2>=0, i3>=0, i4>=0, i5>=0, i6>=0, i7>=0);
    _mm512_mask_i32scatter_epi64((long long *)array, mask, indx, data, 8);
}

static inline void scatter(Vec16i const & index, uint32_t limit, Vec16i const & data, void * array) {
    Vec16ib mask = Vec16ui(index) < limit;
    _mm512_mask_i32scatter_epi32((int*)array, mask, index, data, 4);
}

static inline void scatter(Vec8q const & index, uint32_t limit, Vec8q const & data, void * array) {
    Vec8qb mask = Vec8uq(index) < uint64_t(limit);
    _mm512_mask_i64scatter_epi64((long long *)array, mask, index, data, 8);
}

static inline void scatter(Vec8i const & index, uint32_t limit, Vec8q const & data, void * array) {
#if defined (__AVX512VL__)
    __mmask16 mask = _mm256_cmplt_epu32_mask(index, Vec8ui(limit));
#else
    __mmask16 mask = _mm512_cmplt_epu32_mask(_mm512_castsi256_si512(index), _mm512_castsi256_si512(Vec8ui(limit)));
#endif
    _mm512_mask_i32scatter_epi64((long long *)array, mask, index, data, 8);
}
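
// Example (illustrative sketch, not part of the library): the limit parameter
// makes out-of-range indexes (including negative ones, which become large
// unsigned values) skip the write instead of corrupting memory:
// int64_t arr[20] = {0};
// Vec8q  data(1,2,3,4,5,6,7,8);
// Vec8q  idx (0,5,10,15,19,25,-1,3);  // 25 and -1 are out of range
// scatter(idx, 20, data, arr);        // writes 1,2,3,4,5,8; skips 6 and 7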

/*****************************************************************************
*
* Functions for conversion between integer sizes
*
*****************************************************************************/

// Extend 16-bit integers to 32-bit integers, signed and unsigned

// Function extend_to_int : extends Vec16s to Vec16i with sign extension
static inline Vec16i extend_to_int (Vec16s const & a) {
    return _mm512_cvtepi16_epi32(a);
}

// Function extend_to_int : extends Vec16us to Vec16ui with zero extension
static inline Vec16ui extend_to_int (Vec16us const & a) {
    return _mm512_cvtepu16_epi32(a);
}

// Function extend_to_int : extends Vec16c to Vec16i with sign extension
static inline Vec16i extend_to_int (Vec16c const & a) {
    return _mm512_cvtepi8_epi32(a);
}

// Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
static inline Vec16ui extend_to_int (Vec16uc const & a) {
    return _mm512_cvtepu8_epi32(a);
}


// Extend 32-bit integers to 64-bit integers, signed and unsigned

// Function extend_low : extends the low 8 elements to 64 bits with sign extension
static inline Vec8q extend_low (Vec16i const & a) {
    return _mm512_cvtepi32_epi64(a.get_low());
}

// Function extend_high : extends the high 8 elements to 64 bits with sign extension
static inline Vec8q extend_high (Vec16i const & a) {
    return _mm512_cvtepi32_epi64(a.get_high());
}

// Function extend_low : extends the low 8 elements to 64 bits with zero extension
static inline Vec8uq extend_low (Vec16ui const & a) {
    return _mm512_cvtepu32_epi64(a.get_low());
}

// Function extend_high : extends the high 8 elements to 64 bits with zero extension
static inline Vec8uq extend_high (Vec16ui const & a) {
    return _mm512_cvtepu32_epi64(a.get_high());
}

// Compress 32-bit integers to 8-bit or 16-bit integers, signed and unsigned, with and without saturation

// Function compress_to_int8 : packs a vector of 32-bit integers into a vector of 8-bit integers
// Overflow wraps around
static inline Vec16c compress_to_int8 (Vec16i const & a) {
    return _mm512_cvtepi32_epi8(a);
}

// Function compress_to_int16 : packs a vector of 32-bit integers into a vector of 16-bit integers
// Overflow wraps around
static inline Vec16s compress_to_int16 (Vec16i const & a) {
    return _mm512_cvtepi32_epi16(a);
}

// with signed saturation
static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
    return _mm512_cvtsepi32_epi8(a);
}

static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
    return _mm512_cvtsepi32_epi16(a);
}

// with unsigned saturation
static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
    return _mm512_cvtusepi32_epi8(a);
}

static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
    return _mm512_cvtusepi32_epi16(a);
}

// Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation

// Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Overflow wraps around
static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
    Vec8i low2  = _mm512_cvtepi64_epi32(low);
    Vec8i high2 = _mm512_cvtepi64_epi32(high);
    return Vec16i(low2, high2);
}

// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Signed, with saturation
static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
    Vec8i low2  = _mm512_cvtsepi64_epi32(low);
    Vec8i high2 = _mm512_cvtsepi64_epi32(high);
    return Vec16i(low2, high2);
}

// Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
// Unsigned, with saturation
static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
    Vec8ui low2  = _mm512_cvtusepi64_epi32(low);
    Vec8ui high2 = _mm512_cvtusepi64_epi32(high);
    return Vec16ui(low2, high2);
}
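
// Example (illustrative, not part of the library):
// Vec8q  lo(0,1,2,3,4,5,6,7);
// Vec8q  hi(8,9,10,11,12,13,0x100000000,-1);
// Vec16i c  = compress(lo, hi);            // wraps: element 14 becomes 0
// Vec16i cs = compress_saturated(lo, hi);  // saturates: element 14 becomes 0x7FFFFFFF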


/*****************************************************************************
*
* Integer division operators
*
* Please see the file vectori128.h for explanation.
*
*****************************************************************************/

// vector operator / : divide each element by divisor

// vector of 16 32-bit signed integers
static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
    __m512i m   = _mm512_broadcast_i32x4(d.getm());        // broadcast multiplier
    __m512i sgn = _mm512_broadcast_i32x4(d.getsign());     // broadcast sign of d
    __m512i t1  = _mm512_mul_epi32(a,m);                   // 32x32->64 bit signed multiplication of even elements of a
    __m512i t3  = _mm512_srli_epi64(a,32);                 // get odd elements of a into position for multiplication
    __m512i t4  = _mm512_mul_epi32(t3,m);                  // 32x32->64 bit signed multiplication of odd elements
    __m512i t2  = _mm512_srli_epi64(t1,32);                // high dword of even index results
    __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
    __m512i t8  = _mm512_add_epi32(t7,a);                  // add
    __m512i t9  = _mm512_sra_epi32(t8,d.gets1());          // shift right arithmetic
    __m512i t10 = _mm512_srai_epi32(a,31);                 // sign of a
    __m512i t11 = _mm512_sub_epi32(t10,sgn);               // sign of a - sign of d
    __m512i t12 = _mm512_sub_epi32(t9,t11);                // + 1 if a < 0, -1 if d < 0
    return        _mm512_xor_si512(t12,sgn);               // change sign if divisor negative
}

// vector of 16 32-bit unsigned integers
static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
    __m512i m   = _mm512_broadcast_i32x4(d.getm());        // broadcast multiplier
    __m512i t1  = _mm512_mul_epu32(a,m);                   // 32x32->64 bit unsigned multiplication of even elements of a
    __m512i t3  = _mm512_srli_epi64(a,32);                 // get odd elements of a into position for multiplication
    __m512i t4  = _mm512_mul_epu32(t3,m);                  // 32x32->64 bit unsigned multiplication of odd elements
    __m512i t2  = _mm512_srli_epi64(t1,32);                // high dword of even index results
    __m512i t7  = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);  // blend two results
    __m512i t8  = _mm512_sub_epi32(a,t7);                  // subtract
    __m512i t9  = _mm512_srl_epi32(t8,d.gets1());          // shift right logical
    __m512i t10 = _mm512_add_epi32(t7,t9);                 // add
    return        _mm512_srl_epi32(t10,d.gets2());         // shift right logical
}

// vector operator /= : divide
static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
    a = a / d;
    return a;
}

// vector operator /= : divide
static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
    a = a / d;
    return a;
}
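
// Example (illustrative sketch, not part of the library): a Divisor_i object
// precomputes the multiplier and shift counts once, which pays off when the
// same divisor is applied to many vectors:
// Divisor_i d(7);    // prepare division by 7
// Vec16i a(100);     // all elements = 100
// Vec16i q = a / d;  // all elements = 14 (truncated quotient)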


/*****************************************************************************
*
* Integer division 2: divisor is a compile-time constant
*
*****************************************************************************/

// Divide Vec16i by compile-time constant
template <int32_t d>
static inline Vec16i divide_by_i(Vec16i const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
    if (d ==  1) return  x;
    if (d == -1) return -x;
    if (uint32_t(d) == 0x80000000u) {
        return _mm512_maskz_set1_epi32(x == Vec16i(0x80000000), 1);  // avoid overflow of abs(d). return (x == 0x80000000) ? 1 : 0;
    }
    const uint32_t d1 = d > 0 ? uint32_t(d) : -uint32_t(d);          // compile-time abs(d). (force GCC compiler to treat d as 32 bits, not 64 bits)
    if ((d1 & (d1-1)) == 0) {
        // d1 is a power of 2. use shift
        const int k = bit_scan_reverse_const(d1);
        __m512i sign;
        if (k > 1) sign = _mm512_srai_epi32(x, k-1); else sign = x;  // k copies of sign bit
        __m512i bias    = _mm512_srli_epi32(sign, 32-k);             // bias = x >= 0 ? 0 : 2^k - 1
        __m512i xpbias  = _mm512_add_epi32 (x, bias);                // x + bias
        __m512i q       = _mm512_srai_epi32(xpbias, k);              // (x + bias) >> k
        if (d > 0) return q;                                         // d > 0: return q
        return _mm512_sub_epi32(_mm512_setzero_epi32(), q);          // d < 0: return -q
    }
    // general case
    const int32_t sh = bit_scan_reverse_const(uint32_t(d1)-1);       // ceil(log2(d1)) - 1. (d1 < 2 handled by power of 2 case)
    const int32_t mult = int(1 + (uint64_t(1) << (32+sh)) / uint32_t(d1) - (int64_t(1) << 32));  // multiplier
    const Divisor_i div(mult, sh, d < 0 ? -1 : 0);
    return x / div;
}

// define Vec16i a / const_int(d)
template <int32_t d>
static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
    return divide_by_i<d>(a);
}

// define Vec16i a / const_uint(d)
template <uint32_t d>
static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
    Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned;  // Error: dividing signed by overflowing unsigned
    return divide_by_i<int32_t(d)>(a);  // signed divide
}

// vector operator /= : divide
template <int32_t d>
static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}


// Divide Vec16ui by compile-time constant
template <uint32_t d>
static inline Vec16ui divide_by_ui(Vec16ui const & x) {
    Static_error_check<(d!=0)> Dividing_by_zero;                     // Error message if dividing by zero
    if (d == 1) return x;                                            // divide by 1
    const int b = bit_scan_reverse_const(d);                         // floor(log2(d))
    if ((uint32_t(d) & (uint32_t(d)-1)) == 0) {
        // d is a power of 2. use shift
        return _mm512_srli_epi32(x, b);                              // x >> b
    }
    // general case (d > 2)
    uint32_t mult = uint32_t((uint64_t(1) << (b+32)) / d);           // multiplier = 2^(32+b) / d
    const uint64_t rem = (uint64_t(1) << (b+32)) - uint64_t(d)*mult; // remainder 2^(32+b) % d
    const bool round_down = (2*rem < d);                             // check if fraction is less than 0.5
    if (!round_down) {
        mult = mult + 1;                                             // round up mult
    }
    // do 32*32->64 bit unsigned multiplication and get high part of result
    const __m512i multv = Vec16ui(uint64_t(mult));                   // zero-extend mult and broadcast
    __m512i t1 = _mm512_mul_epu32(x,multv);                          // 32x32->64 bit unsigned multiplication of even elements
    if (round_down) {
        t1 = _mm512_add_epi64(t1,multv);                             // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m512i t2 = _mm512_srli_epi64(t1,32);                           // high dword of even index results
    __m512i t3 = _mm512_srli_epi64(x,32);                            // get odd elements into position for multiplication
    __m512i t4 = _mm512_mul_epu32(t3,multv);                         // 32x32->64 bit unsigned multiplication of odd elements
    if (round_down) {
        t4 = _mm512_add_epi64(t4,multv);                             // compensate for rounding error. (x+1)*m replaced by x*m+m to avoid overflow
    }
    __m512i t7 = _mm512_mask_mov_epi32(t2, __mmask16(0xAAAA), t4);   // blend two results
    Vec16ui q  = _mm512_srli_epi32(t7, b);                           // shift right by b
    return q;                                                        // no overflow possible
}

// define Vec16ui a / const_uint(d)
template <uint32_t d>
static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
    return divide_by_ui<d>(a);
}

// define Vec16ui a / const_int(d)
template <int32_t d>
static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
    Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;  // Error: dividing unsigned by negative is ambiguous
    return divide_by_ui<d>(a);  // unsigned divide
}

// vector operator /= : divide
template <uint32_t d>
static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
    a = a / b;
    return a;
}

// vector operator /= : divide
template <int32_t d>
static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
    a = a / b;
    return a;
}
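
// Example (illustrative sketch, assuming the const_uint macro from
// vectori128.h): with a compile-time constant divisor the multiplier and
// shift count are computed entirely at compile time:
// Vec16ui a(1000);                  // all elements = 1000
// Vec16ui q = a / const_uint(10);   // all elements = 100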

/*****************************************************************************
*
* Horizontal scan functions
*
*****************************************************************************/

// Get index to the first element that is true. Return -1 if all are false

static inline int horizontal_find_first(Vec16ib const & x) {
    uint32_t b = uint16_t(__mmask16(x));
    if (b) {
        return bit_scan_forward(b);
    }
    else {
        return -1;
    }
}

static inline int horizontal_find_first(Vec8qb const & x) {
    uint32_t b = uint8_t(__mmask16(x));
    if (b) {
        return bit_scan_forward(b);
    }
    else {
        return -1;
    }
}

// Count the number of elements that are true

static inline uint32_t horizontal_count(Vec16ib const & x) {
    return vml_popcnt(uint32_t(uint16_t(__mmask16(x))));
}

static inline uint32_t horizontal_count(Vec8qb const & x) {
    return vml_popcnt(uint32_t(uint16_t(__mmask16(x))));
}

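// Example (illustrative, not part of the library):
// Vec16i  a(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
// Vec16ib m = a > 5;                       // true for elements 6..15
// int first  = horizontal_find_first(m);  // first = 6
// uint32_t n = horizontal_count(m);       // n = 10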


/*****************************************************************************
*
* Boolean <-> bitfield conversion functions
*
*****************************************************************************/

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4ib x) {
    __m512i a = _mm512_castsi128_si512(x);
    __mmask16 b = _mm512_mask_testn_epi32_mask(0xF, a, a);
    return uint8_t(b) ^ 0xF;
}

// to_Vec4ib: convert integer bitfield to boolean vector
static inline Vec4ib to_Vec4ib(uint8_t x) {
    return _mm512_castsi512_si128(_mm512_maskz_set1_epi32(__mmask16(x), -1));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec2qb x) {
    __m512i a = _mm512_castsi128_si512(x);
    __mmask16 b = _mm512_mask_testn_epi64_mask(0x3, a, a);
    return uint8_t(b) ^ 0x3;
}

// to_Vec2qb: convert integer bitfield to boolean vector
static inline Vec2qb to_Vec2qb(uint8_t x) {
    return _mm512_castsi512_si128(_mm512_maskz_set1_epi64(__mmask16(x), -1LL));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec8ib x) {
    __m512i a = _mm512_castsi256_si512(x);
    __mmask16 b = _mm512_mask_testn_epi32_mask(0xFF, a, a);
    return ~ uint8_t(b);
}

// to_Vec8ib: convert integer bitfield to boolean vector
static inline Vec8ib to_Vec8ib(uint8_t x) {
    return _mm512_castsi512_si256(_mm512_maskz_set1_epi32(__mmask16(x), -1));
}

// to_bits: convert boolean vector to integer bitfield
static inline uint8_t to_bits(Vec4qb x) {
    __m512i a = _mm512_castsi256_si512(x);
    __mmask16 b = _mm512_mask_testn_epi64_mask(0xF, a, a);
    return uint8_t(b) ^ 0xF;
}

// to_Vec4qb: convert integer bitfield to boolean vector
static inline Vec4qb to_Vec4qb(uint8_t x) {
    return _mm512_castsi512_si256(_mm512_maskz_set1_epi64(__mmask16(x), -1LL));
}


// to_bits: convert boolean vector to integer bitfield
static inline uint16_t to_bits(Vec16b a) {
    return (uint16_t)(__mmask16)a;
}

// to_Vec16b: convert integer bitfield to boolean vector
static inline Vec16b to_Vec16b(uint16_t x) {
    return (__mmask16)x;
}

// to_Vec16ib: convert integer bitfield to boolean vector
static inline Vec16ib to_Vec16ib(uint16_t x) {
    return to_Vec16b(x);
}

// to_Vec8qb: convert integer bitfield to boolean vector
static inline Vec8qb to_Vec8qb(uint8_t x) {
    return (__mmask16)x;
}
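
// Example (illustrative, not part of the library): to_bits and to_Vec16b are
// exact inverses; bit i of the bitfield corresponds to element i:
// Vec16b   m    = to_Vec16b(0x00FF);  // true in elements 0-7, false in 8-15
// uint16_t bits = to_bits(m);         // bits = 0x00FF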

#ifdef VCL_NAMESPACE
}
#endif

#endif // VECTORI512_H