1 /**************************** vectori512e.h *******************************
2 * Author: Agner Fog
3 * Date created: 2014-07-23
4 * Last modified: 2017-02-19
5 * Version: 1.27
6 * Project: vector classes
7 * Description:
8 * Header file defining integer vector classes as interface to intrinsic
9 * functions in x86 microprocessors with AVX512 and later instruction sets.
10 *
11 * Instructions:
12 * Use Gnu, Intel or Microsoft C++ compiler. Compile for the desired
13 * instruction set, which must be at least AVX512.
14 *
15 * The following vector classes are defined here:
16 * Vec16i Vector of 16 32-bit signed integers
17 * Vec16ui Vector of 16 32-bit unsigned integers
18 * Vec16ib Vector of 16 Booleans for use with Vec16i and Vec16ui
19 * Vec8q Vector of 8 64-bit signed integers
20 * Vec8uq Vector of 8 64-bit unsigned integers
21 * Vec8qb Vector of 8 Booleans for use with Vec8q and Vec8uq
22 *
23 * Each vector object is represented internally in the CPU as a 512-bit register.
24 * This header file defines operators and functions for these vectors.
25 *
26 * For detailed instructions, see VectorClass.pdf
27 *
28 * (c) Copyright 2014-2017 GNU General Public License http://www.gnu.org/licenses
29 *****************************************************************************/
30
31 // check combination of header files
32 #if defined (VECTORI512_H)
33 #if VECTORI512_H != 1
34 #error Two different versions of vectori512.h included
35 #endif
36 #else
37 #define VECTORI512_H 1
38
39 #ifdef VCL_NAMESPACE
40 namespace VCL_NAMESPACE {
41 #endif
42
43 /*****************************************************************************
44 *
45 * base class Vec512ie
46 *
47 *****************************************************************************/
48 // base class to replace _mm512i when AVX512 is not supported
49 class Vec512ie {
50 protected:
51 Vec256b z0; // low half
52 Vec256b z1; // high half
53 public:
Vec512ie(void)54 Vec512ie(void) {}; // default constructor
Vec512ie(Vec8i const & x0,Vec8i const & x1)55 Vec512ie(Vec8i const & x0, Vec8i const & x1) { // constructor to build from two Vec8i
56 z0 = x0; z1 = x1;
57 }
get_low()58 Vec8i get_low() const { // get low half
59 return Vec8i(z0);
60 }
get_high()61 Vec8i get_high() const { // get high half
62 return Vec8i(z1);
63 }
64 };
65
66
67 /*****************************************************************************
68 *
69 * Vector of 512 1-bit unsigned integers or Booleans
70 *
71 *****************************************************************************/
72 class Vec512b : public Vec512ie {
73 public:
74 // Default constructor:
Vec512b()75 Vec512b() {
76 }
77 // Constructor to build from two Vec256b:
Vec512b(Vec256b const & a0,Vec256b const & a1)78 Vec512b(Vec256b const & a0, Vec256b const & a1) {
79 z0 = a0; z1 = a1;
80 }
81 // Constructor to convert from type Vec512ie
Vec512b(Vec512ie const & x)82 Vec512b(Vec512ie const & x) {
83 z0 = x.get_low(); z1 = x.get_high();
84 }
85 // Assignment operator to convert from type Vec512ie
86 Vec512b & operator = (Vec512ie const & x) {
87 z0 = x.get_low(); z1 = x.get_high();
88 return *this;
89 }
90 // Member function to load from array (unaligned)
load(void const * p)91 Vec512b & load(void const * p) {
92 z0 = Vec8i().load(p);
93 z1 = Vec8i().load((int32_t const*)p+8);
94 return *this;
95 }
96 // Member function to load from array, aligned by 64
load_a(void const * p)97 Vec512b & load_a(void const * p) {
98 z0 = Vec8i().load_a(p);
99 z1 = Vec8i().load_a((int32_t const*)p+8);
100 return *this;
101 }
102 // Member function to store into array (unaligned)
store(void * p)103 void store(void * p) const {
104 Vec8i(z0).store(p);
105 Vec8i(z1).store((int32_t*)p+8);
106 }
107 // Member function to store into array, aligned by 64
store_a(void * p)108 void store_a(void * p) const {
109 Vec8i(z0).store_a(p);
110 Vec8i(z1).store_a((int32_t*)p+8);
111 }
112 // Member function to change a single bit
113 // Note: This function is inefficient. Use load function if changing more than one bit
set_bit(uint32_t index,int value)114 Vec512b const & set_bit(uint32_t index, int value) {
115 if (index < 256) {
116 z0 = Vec8i(z0).set_bit(index, value);
117 }
118 else {
119 z1 = Vec8i(z1).set_bit(index-256, value);
120 }
121 return *this;
122 }
123 // Member function to get a single bit
124 // Note: This function is inefficient. Use store function if reading more than one bit
get_bit(uint32_t index)125 int get_bit(uint32_t index) const {
126 if (index < 256) {
127 return Vec8i(z0).get_bit(index);
128 }
129 else {
130 return Vec8i(z1).get_bit(index-256);
131 }
132 }
133 // Extract a single element. Use store function if extracting more than one element.
134 // Operator [] can only read an element, not write.
135 bool operator [] (uint32_t index) const {
136 return get_bit(index) != 0;
137 }
138 // Member functions to split into two Vec128b:
get_low()139 Vec256b get_low() const {
140 return z0;
141 }
get_high()142 Vec256b get_high() const {
143 return z1;
144 }
size()145 static int size () {
146 return 512;
147 }
148 };
149
150 // Define operators for this class
151
152 // vector operator & : bitwise and
153 static inline Vec512b operator & (Vec512b const & a, Vec512b const & b) {
154 return Vec512b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
155 }
156 static inline Vec512b operator && (Vec512b const & a, Vec512b const & b) {
157 return a & b;
158 }
159
160 // vector operator | : bitwise or
161 static inline Vec512b operator | (Vec512b const & a, Vec512b const & b) {
162 return Vec512b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
163 }
164 static inline Vec512b operator || (Vec512b const & a, Vec512b const & b) {
165 return a | b;
166 }
167
168 // vector operator ^ : bitwise xor
169 static inline Vec512b operator ^ (Vec512b const & a, Vec512b const & b) {
170 return Vec512b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
171 }
172
173 // vector operator ~ : bitwise not
174 static inline Vec512b operator ~ (Vec512b const & a) {
175 return Vec512b(~a.get_low(), ~a.get_high());
176 }
177
178 // vector operator &= : bitwise and
179 static inline Vec512b & operator &= (Vec512b & a, Vec512b const & b) {
180 a = a & b;
181 return a;
182 }
183
184 // vector operator |= : bitwise or
185 static inline Vec512b & operator |= (Vec512b & a, Vec512b const & b) {
186 a = a | b;
187 return a;
188 }
189
190 // vector operator ^= : bitwise xor
191 static inline Vec512b & operator ^= (Vec512b & a, Vec512b const & b) {
192 a = a ^ b;
193 return a;
194 }
195
196 // Define functions for this class
197
198 // function andnot: a & ~ b
andnot(Vec512b const & a,Vec512b const & b)199 static inline Vec512b andnot (Vec512b const & a, Vec512b const & b) {
200 return Vec512b(andnot(a.get_low(), b.get_low()), andnot(a.get_high(), b.get_high()));
201 }
202
203
204
205 /*****************************************************************************
206 *
207 * Generate compile-time constant vector
208 *
209 *****************************************************************************/
210 // Generate a constant vector of 8 integers stored in memory.
211 // Can be converted to any integer vector type
212 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
constant16i()213 static inline Vec512ie constant16i() {
214 static const union {
215 int32_t i[16];
216 Vec256b y[2]; // note: requires C++0x or later. Use option -std=c++0x
217 } u = {{i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15}};
218 return Vec512ie(u.y[0], u.y[1]);
219 }
220
221
222 /*****************************************************************************
223 *
224 * Boolean vector base classes for AVX512
225 *
226 *****************************************************************************/
227
228 class Vec16b : public Vec512b {
229 public:
230 // Default constructor:
Vec16b()231 Vec16b () {
232 }
233 // Constructor to build from all elements:
Vec16b(bool b0,bool b1,bool b2,bool b3,bool b4,bool b5,bool b6,bool b7,bool b8,bool b9,bool b10,bool b11,bool b12,bool b13,bool b14,bool b15)234 Vec16b(bool b0, bool b1, bool b2, bool b3, bool b4, bool b5, bool b6, bool b7,
235 bool b8, bool b9, bool b10, bool b11, bool b12, bool b13, bool b14, bool b15) {
236 *this = Vec512b(Vec8i(-(int)b0, -(int)b1, -(int)b2, -(int)b3, -(int)b4, -(int)b5, -(int)b6, -(int)b7), Vec8i(-(int)b8, -(int)b9, -(int)b10, -(int)b11, -(int)b12, -(int)b13, -(int)b14, -(int)b15));
237 }
238 // Constructor to convert from type Vec512b
Vec16b(Vec512b const & x)239 Vec16b (Vec512b const & x) {
240 z0 = x.get_low();
241 z1 = x.get_high();
242 }
243 // Constructor to make from two halves
Vec16b(Vec8ib const & x0,Vec8ib const & x1)244 Vec16b (Vec8ib const & x0, Vec8ib const & x1) {
245 z0 = x0;
246 z1 = x1;
247 }
248 // Constructor to make from two halves
Vec16b(Vec8i const & x0,Vec8i const & x1)249 Vec16b (Vec8i const & x0, Vec8i const & x1) {
250 z0 = x0;
251 z1 = x1;
252 }
253 // Constructor to broadcast single value:
Vec16b(bool b)254 Vec16b(bool b) {
255 z0 = z1 = Vec8i(-int32_t(b));
256 }
257 // Assignment operator to broadcast scalar value:
258 Vec16b & operator = (bool b) {
259 z0 = z1 = Vec8i(-int32_t(b));
260 return *this;
261 }
262 private:
263 // Prevent constructing from int, etc. because of ambiguity
264 Vec16b(int b);
265 // Prevent assigning int because of ambiguity
266 Vec16b & operator = (int x);
267 public:
268 // split into two halves
get_low()269 Vec8ib get_low() const {
270 return Vec8ib(z0);
271 }
get_high()272 Vec8ib get_high() const {
273 return Vec8ib(z1);
274 }
275 // Assignment operator to convert from type Vec512b
276 Vec16b & operator = (Vec512b const & x) {
277 z0 = x.get_low();
278 z1 = x.get_high();
279 return *this;
280 }
281 // Member function to change a single element in vector
282 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)283 Vec16b const & insert(uint32_t index, bool value) {
284 if (index < 8) {
285 z0 = Vec8ib(z0).insert(index, value);
286 }
287 else {
288 z1 = Vec8ib(z1).insert(index-8, value);
289 }
290 return *this;
291 }
292 // Member function extract a single element from vector
extract(uint32_t index)293 bool extract(uint32_t index) const {
294 if (index < 8) {
295 return Vec8ib(z0).extract(index);
296 }
297 else {
298 return Vec8ib(z1).extract(index-8);
299 }
300 }
301 // Extract a single element. Operator [] can only read an element, not write.
302 bool operator [] (uint32_t index) const {
303 return extract(index);
304 }
size()305 static int size () {
306 return 16;
307 }
308 };
309
310 // Define operators for this class
311
312 // vector operator & : bitwise and
313 static inline Vec16b operator & (Vec16b const & a, Vec16b const & b) {
314 return Vec16b(a.get_low() & b.get_low(), a.get_high() & b.get_high());
315 }
316 static inline Vec16b operator && (Vec16b const & a, Vec16b const & b) {
317 return a & b;
318 }
319
320 // vector operator | : bitwise or
321 static inline Vec16b operator | (Vec16b const & a, Vec16b const & b) {
322 return Vec16b(a.get_low() | b.get_low(), a.get_high() | b.get_high());
323 }
324 static inline Vec16b operator || (Vec16b const & a, Vec16b const & b) {
325 return a | b;
326 }
327
328 // vector operator ^ : bitwise xor
329 static inline Vec16b operator ^ (Vec16b const & a, Vec16b const & b) {
330 return Vec16b(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
331 }
332
333 // vector operator ~ : bitwise not
334 static inline Vec16b operator ~ (Vec16b const & a) {
335 return Vec16b(~(a.get_low()), ~(a.get_high()));
336 }
337
338 // vector operator ! : element not
339 static inline Vec16b operator ! (Vec16b const & a) {
340 return ~a;
341 }
342
343 // vector operator &= : bitwise and
344 static inline Vec16b & operator &= (Vec16b & a, Vec16b const & b) {
345 a = a & b;
346 return a;
347 }
348
349 // vector operator |= : bitwise or
350 static inline Vec16b & operator |= (Vec16b & a, Vec16b const & b) {
351 a = a | b;
352 return a;
353 }
354
355 // vector operator ^= : bitwise xor
356 static inline Vec16b & operator ^= (Vec16b & a, Vec16b const & b) {
357 a = a ^ b;
358 return a;
359 }
360
361 /*****************************************************************************
362 *
363 * Functions for boolean vectors
364 *
365 *****************************************************************************/
366
367 // function andnot: a & ~ b
andnot(Vec16b const & a,Vec16b const & b)368 static inline Vec16b andnot (Vec16b const & a, Vec16b const & b) {
369 return Vec16b(Vec8ib(andnot(a.get_low(),b.get_low())), Vec8ib(andnot(a.get_high(),b.get_high())));
370 }
371
372 // horizontal_and. Returns true if all bits are 1
horizontal_and(Vec16b const & a)373 static inline bool horizontal_and (Vec16b const & a) {
374 return horizontal_and(a.get_low() & a.get_high());
375 }
376
377 // horizontal_or. Returns true if at least one bit is 1
horizontal_or(Vec16b const & a)378 static inline bool horizontal_or (Vec16b const & a) {
379 return horizontal_or(a.get_low() | a.get_high());
380 }
381
382
383 /*****************************************************************************
384 *
385 * Vec16ib: Vector of 16 Booleans for use with Vec16i and Vec16ui
386 *
387 *****************************************************************************/
388
389 class Vec16ib : public Vec16b {
390 public:
391 // Default constructor:
Vec16ib()392 Vec16ib () {
393 }
Vec16ib(Vec16b const & x)394 Vec16ib (Vec16b const & x) {
395 z0 = x.get_low();
396 z1 = x.get_high();
397 }
398 // Constructor to build from all elements:
Vec16ib(bool x0,bool x1,bool x2,bool x3,bool x4,bool x5,bool x6,bool x7,bool x8,bool x9,bool x10,bool x11,bool x12,bool x13,bool x14,bool x15)399 Vec16ib(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7,
400 bool x8, bool x9, bool x10, bool x11, bool x12, bool x13, bool x14, bool x15) {
401 z0 = Vec8ib(x0, x1, x2, x3, x4, x5, x6, x7);
402 z1 = Vec8ib(x8, x9, x10, x11, x12, x13, x14, x15);
403 }
404 // Constructor to convert from type Vec512b
Vec16ib(Vec512b const & x)405 Vec16ib (Vec512b const & x) {
406 z0 = x.get_low();
407 z1 = x.get_high();
408 }
409 // Construct from two halves
Vec16ib(Vec8ib const & x0,Vec8ib const & x1)410 Vec16ib (Vec8ib const & x0, Vec8ib const & x1) {
411 z0 = x0;
412 z1 = x1;
413 }
414 // Assignment operator to convert from type Vec512b
415 Vec16ib & operator = (Vec512b const & x) {
416 z0 = x.get_low();
417 z1 = x.get_high();
418 return *this;
419 }
420 // Constructor to broadcast scalar value:
Vec16ib(bool b)421 Vec16ib(bool b) : Vec16b(b) {
422 }
423 // Assignment operator to broadcast scalar value:
424 Vec16ib & operator = (bool b) {
425 *this = Vec16b(b);
426 return *this;
427 }
428 private: // Prevent constructing from int, etc.
429 Vec16ib(int b);
430 Vec16ib & operator = (int x);
431 public:
432 };
433
434 // Define operators for Vec16ib
435
436 // vector operator & : bitwise and
437 static inline Vec16ib operator & (Vec16ib const & a, Vec16ib const & b) {
438 return Vec16b(a) & Vec16b(b);
439 }
440 static inline Vec16ib operator && (Vec16ib const & a, Vec16ib const & b) {
441 return a & b;
442 }
443
444 // vector operator | : bitwise or
445 static inline Vec16ib operator | (Vec16ib const & a, Vec16ib const & b) {
446 return Vec16b(a) | Vec16b(b);
447 }
448 static inline Vec16ib operator || (Vec16ib const & a, Vec16ib const & b) {
449 return a | b;
450 }
451
452 // vector operator ^ : bitwise xor
453 static inline Vec16ib operator ^ (Vec16ib const & a, Vec16ib const & b) {
454 return Vec16b(a) ^ Vec16b(b);
455 }
456
457 // vector operator ~ : bitwise not
458 static inline Vec16ib operator ~ (Vec16ib const & a) {
459 return ~Vec16b(a);
460 }
461
462 // vector operator ! : element not
463 static inline Vec16ib operator ! (Vec16ib const & a) {
464 return ~a;
465 }
466
467 // vector operator &= : bitwise and
468 static inline Vec16ib & operator &= (Vec16ib & a, Vec16ib const & b) {
469 a = a & b;
470 return a;
471 }
472
473 // vector operator |= : bitwise or
474 static inline Vec16ib & operator |= (Vec16ib & a, Vec16ib const & b) {
475 a = a | b;
476 return a;
477 }
478
479 // vector operator ^= : bitwise xor
480 static inline Vec16ib & operator ^= (Vec16ib & a, Vec16ib const & b) {
481 a = a ^ b;
482 return a;
483 }
484
485 // vector function andnot
andnot(Vec16ib const & a,Vec16ib const & b)486 static inline Vec16ib andnot (Vec16ib const & a, Vec16ib const & b) {
487 return Vec16ib(andnot(Vec16b(a), Vec16b(b)));
488 }
489
490
491 /*****************************************************************************
492 *
493 * Vec8b: Base class vector of 8 Booleans
494 *
495 *****************************************************************************/
496
497 class Vec8b : public Vec16b {
498 public:
499 // Default constructor:
Vec8b()500 Vec8b () {
501 }
Vec8b(Vec16b const & x)502 Vec8b (Vec16b const & x) {
503 z0 = x.get_low();
504 z1 = x.get_high();
505 }
506 // Constructor to convert from type Vec512b
Vec8b(Vec512b const & x)507 Vec8b (Vec512b const & x) {
508 z0 = x.get_low();
509 z1 = x.get_high();
510 }
511 // construct from two halves
Vec8b(Vec4qb const & x0,Vec4qb const & x1)512 Vec8b (Vec4qb const & x0, Vec4qb const & x1) {
513 z0 = x0;
514 z1 = x1;
515 }
516 // Constructor to broadcast single value:
Vec8b(bool b)517 Vec8b(bool b) {
518 z0 = z1 = Vec8i(-int32_t(b));
519 }
520 // Assignment operator to broadcast scalar value:
521 Vec8b & operator = (bool b) {
522 z0 = z1 = Vec8i(-int32_t(b));
523 return *this;
524 }
525 private:
526 // Prevent constructing from int, etc. because of ambiguity
527 Vec8b(int b);
528 // Prevent assigning int because of ambiguity
529 Vec8b & operator = (int x);
530 public:
531 // split into two halves
get_low()532 Vec4qb get_low() const {
533 return Vec4qb(z0);
534 }
get_high()535 Vec4qb get_high() const {
536 return Vec4qb(z1);
537 }
538 // Assignment operator to convert from type Vec512b
539 Vec8b & operator = (Vec512b const & x) {
540 z0 = x.get_low();
541 z1 = x.get_high();
542 return *this;
543 }
544 // Member function to change a single element in vector
545 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,bool value)546 Vec8b const & insert(uint32_t index, bool value) {
547 if (index < 4) {
548 z0 = Vec4qb(z0).insert(index, value);
549 }
550 else {
551 z1 = Vec4qb(z1).insert(index-4, value);
552 }
553 return *this;
554 }
extract(uint32_t index)555 bool extract(uint32_t index) const {
556 if (index < 4) {
557 return Vec4qb(Vec4q(z0)).extract(index);
558 }
559 else {
560 return Vec4qb(Vec4q(z1)).extract(index-4);
561 }
562 }
563 bool operator [] (uint32_t index) const {
564 return extract(index);
565 }
size()566 static int size () {
567 return 8;
568 }
569 };
570
571
/*****************************************************************************
*
*          Vec8qb: Vector of 8 Booleans for use with Vec8q and Vec8uq
*
*****************************************************************************/
577
578 class Vec8qb : public Vec8b {
579 public:
580 // Default constructor:
Vec8qb()581 Vec8qb () {
582 }
Vec8qb(Vec16b const & x)583 Vec8qb (Vec16b const & x) {
584 z0 = x.get_low();
585 z1 = x.get_high();
586 }
587 // Constructor to build from all elements:
Vec8qb(bool x0,bool x1,bool x2,bool x3,bool x4,bool x5,bool x6,bool x7)588 Vec8qb(bool x0, bool x1, bool x2, bool x3, bool x4, bool x5, bool x6, bool x7) {
589 z0 = Vec4qb(x0, x1, x2, x3);
590 z1 = Vec4qb(x4, x5, x6, x7);
591 }
592 // Constructor to convert from type Vec512b
Vec8qb(Vec512b const & x)593 Vec8qb (Vec512b const & x) {
594 z0 = x.get_low();
595 z1 = x.get_high();
596 }
597 // construct from two halves
Vec8qb(Vec4qb const & x0,Vec4qb const & x1)598 Vec8qb (Vec4qb const & x0, Vec4qb const & x1) {
599 z0 = x0;
600 z1 = x1;
601 }
602 // Assignment operator to convert from type Vec512b
603 Vec8qb & operator = (Vec512b const & x) {
604 z0 = x.get_low();
605 z1 = x.get_high();
606 return *this;
607 }
608 // Constructor to broadcast single value:
Vec8qb(bool b)609 Vec8qb(bool b) : Vec8b(b) {
610 }
611 // Assignment operator to broadcast scalar value:
612 Vec8qb & operator = (bool b) {
613 *this = Vec8b(b);
614 return *this;
615 }
616 private:
617 // Prevent constructing from int, etc. because of ambiguity
618 Vec8qb(int b);
619 // Prevent assigning int because of ambiguity
620 Vec8qb & operator = (int x);
621 public:
622 };
623
624 // Define operators for Vec8qb
625
626 // vector operator & : bitwise and
627 static inline Vec8qb operator & (Vec8qb const & a, Vec8qb const & b) {
628 return Vec16b(a) & Vec16b(b);
629 }
630 static inline Vec8qb operator && (Vec8qb const & a, Vec8qb const & b) {
631 return a & b;
632 }
633
634 // vector operator | : bitwise or
635 static inline Vec8qb operator | (Vec8qb const & a, Vec8qb const & b) {
636 return Vec16b(a) | Vec16b(b);
637 }
638 static inline Vec8qb operator || (Vec8qb const & a, Vec8qb const & b) {
639 return a | b;
640 }
641
642 // vector operator ^ : bitwise xor
643 static inline Vec8qb operator ^ (Vec8qb const & a, Vec8qb const & b) {
644 return Vec16b(a) ^ Vec16b(b);
645 }
646
647 // vector operator ~ : bitwise not
648 static inline Vec8qb operator ~ (Vec8qb const & a) {
649 return ~Vec16b(a);
650 }
651
652 // vector operator ! : element not
653 static inline Vec8qb operator ! (Vec8qb const & a) {
654 return ~a;
655 }
656
657 // vector operator &= : bitwise and
658 static inline Vec8qb & operator &= (Vec8qb & a, Vec8qb const & b) {
659 a = a & b;
660 return a;
661 }
662
663 // vector operator |= : bitwise or
664 static inline Vec8qb & operator |= (Vec8qb & a, Vec8qb const & b) {
665 a = a | b;
666 return a;
667 }
668
669 // vector operator ^= : bitwise xor
670 static inline Vec8qb & operator ^= (Vec8qb & a, Vec8qb const & b) {
671 a = a ^ b;
672 return a;
673 }
674
675 // vector function andnot
andnot(Vec8qb const & a,Vec8qb const & b)676 static inline Vec8qb andnot (Vec8qb const & a, Vec8qb const & b) {
677 return Vec8qb(andnot(Vec16b(a), Vec16b(b)));
678 }
679
680
681 /*****************************************************************************
682 *
683 * Vector of 16 32-bit signed integers
684 *
685 *****************************************************************************/
686
687 class Vec16i: public Vec512b {
688 public:
689 // Default constructor:
Vec16i()690 Vec16i() {
691 }
692 // Constructor to broadcast the same value into all elements:
Vec16i(int i)693 Vec16i(int i) {
694 z0 = z1 = Vec8i(i);
695 }
696 // Constructor to build from all elements:
Vec16i(int32_t i0,int32_t i1,int32_t i2,int32_t i3,int32_t i4,int32_t i5,int32_t i6,int32_t i7,int32_t i8,int32_t i9,int32_t i10,int32_t i11,int32_t i12,int32_t i13,int32_t i14,int32_t i15)697 Vec16i(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7,
698 int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i12, int32_t i13, int32_t i14, int32_t i15) {
699 z0 = Vec8i(i0, i1, i2, i3, i4, i5, i6, i7);
700 z1 = Vec8i(i8, i9, i10, i11, i12, i13, i14, i15);
701 }
702 // Constructor to build from two Vec8i:
Vec16i(Vec8i const & a0,Vec8i const & a1)703 Vec16i(Vec8i const & a0, Vec8i const & a1) {
704 *this = Vec512b(a0, a1);
705 }
706 // Constructor to convert from type Vec512b
Vec16i(Vec512b const & x)707 Vec16i(Vec512b const & x) {
708 z0 = x.get_low();
709 z1 = x.get_high();
710 }
711 // Assignment operator to convert from type Vec512b
712 Vec16i & operator = (Vec512b const & x) {
713 z0 = x.get_low();
714 z1 = x.get_high();
715 return *this;
716 }
717 // Member function to load from array (unaligned)
load(void const * p)718 Vec16i & load(void const * p) {
719 Vec512b::load(p);
720 return *this;
721 }
722 // Member function to load from array, aligned by 64
load_a(void const * p)723 Vec16i & load_a(void const * p) {
724 Vec512b::load_a(p);
725 return *this;
726 }
727 // Partial load. Load n elements and set the rest to 0
load_partial(int n,void const * p)728 Vec16i & load_partial(int n, void const * p) {
729 if (n < 8) {
730 z0 = Vec8i().load_partial(n, p);
731 z1 = Vec8i(0);
732 }
733 else {
734 z0 = Vec8i().load(p);
735 z1 = Vec8i().load_partial(n - 8, (int32_t const*)p + 8);
736 }
737 return *this;
738 }
739 // Partial store. Store n elements
store_partial(int n,void * p)740 void store_partial(int n, void * p) const {
741 if (n < 8) {
742 Vec8i(get_low()).store_partial(n, p);
743 }
744 else {
745 Vec8i(get_low()).store(p);
746 Vec8i(get_high()).store_partial(n - 8, (int32_t *)p + 8);
747 }
748 }
749 // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)750 Vec16i & cutoff(int n) {
751 if (n < 8) {
752 z0 = Vec8i(z0).cutoff(n);
753 z1 = Vec8i(0);
754 }
755 else {
756 z1 = Vec8i(z1).cutoff(n - 8);
757 }
758 return *this;
759 }
760 // Member function to change a single element in vector
insert(uint32_t index,int32_t value)761 Vec16i const & insert(uint32_t index, int32_t value) {
762 if (index < 8) {
763 z0 = Vec8i(z0).insert(index, value);
764 }
765 else {
766 z1 = Vec8i(z1).insert(index - 8, value);
767 }
768 return *this;
769 }
770 // Member function extract a single element from vector
extract(uint32_t index)771 int32_t extract(uint32_t index) const {
772 if (index < 8) {
773 return Vec8i(z0).extract(index);
774 }
775 else {
776 return Vec8i(z1).extract(index - 8);
777 }
778 }
779 // Extract a single element. Use store function if extracting more than one element.
780 // Operator [] can only read an element, not write.
781 int32_t operator [] (uint32_t index) const {
782 return extract(index);
783 }
784 // Member functions to split into two Vec8i:
get_low()785 Vec8i get_low() const {
786 return Vec8i(z0);
787 }
get_high()788 Vec8i get_high() const {
789 return Vec8i(z1);
790 }
size()791 static int size () {
792 return 16;
793 }
794 };
795
796
797 // Define operators for Vec16i
798
799 // vector operator + : add element by element
800 static inline Vec16i operator + (Vec16i const & a, Vec16i const & b) {
801 return Vec16i(a.get_low() + b.get_low(), a.get_high() + b.get_high());
802 }
803
804 // vector operator += : add
805 static inline Vec16i & operator += (Vec16i & a, Vec16i const & b) {
806 a = a + b;
807 return a;
808 }
809
810 // postfix operator ++
811 static inline Vec16i operator ++ (Vec16i & a, int) {
812 Vec16i a0 = a;
813 a = a + 1;
814 return a0;
815 }
816
817 // prefix operator ++
818 static inline Vec16i & operator ++ (Vec16i & a) {
819 a = a + 1;
820 return a;
821 }
822
823 // vector operator - : subtract element by element
824 static inline Vec16i operator - (Vec16i const & a, Vec16i const & b) {
825 return Vec16i(a.get_low() - b.get_low(), a.get_high() - b.get_high());
826 }
827
828 // vector operator - : unary minus
829 static inline Vec16i operator - (Vec16i const & a) {
830 return Vec16i(-a.get_low(), -a.get_high());
831 }
832
833 // vector operator -= : subtract
834 static inline Vec16i & operator -= (Vec16i & a, Vec16i const & b) {
835 a = a - b;
836 return a;
837 }
838
839 // postfix operator --
840 static inline Vec16i operator -- (Vec16i & a, int) {
841 Vec16i a0 = a;
842 a = a - 1;
843 return a0;
844 }
845
846 // prefix operator --
847 static inline Vec16i & operator -- (Vec16i & a) {
848 a = a - 1;
849 return a;
850 }
851
852 // vector operator * : multiply element by element
853 static inline Vec16i operator * (Vec16i const & a, Vec16i const & b) {
854 return Vec16i(a.get_low() * b.get_low(), a.get_high() * b.get_high());
855 }
856
857 // vector operator *= : multiply
858 static inline Vec16i & operator *= (Vec16i & a, Vec16i const & b) {
859 a = a * b;
860 return a;
861 }
862
863 // vector operator / : divide all elements by same integer
864 // See bottom of file
865
866
867 // vector operator << : shift left
868 static inline Vec16i operator << (Vec16i const & a, int32_t b) {
869 return Vec16i(a.get_low() << b, a.get_high() << b);
870 }
871
872 // vector operator <<= : shift left
873 static inline Vec16i & operator <<= (Vec16i & a, int32_t b) {
874 a = a << b;
875 return a;
876 }
877
878 // vector operator >> : shift right arithmetic
879 static inline Vec16i operator >> (Vec16i const & a, int32_t b) {
880 return Vec16i(a.get_low() >> b, a.get_high() >> b);
881 }
882
883 // vector operator >>= : shift right arithmetic
884 static inline Vec16i & operator >>= (Vec16i & a, int32_t b) {
885 a = a >> b;
886 return a;
887 }
888
889 // vector operator == : returns true for elements for which a == b
890 static inline Vec16ib operator == (Vec16i const & a, Vec16i const & b) {
891 return Vec16ib(a.get_low() == b.get_low(), a.get_high() == b.get_high());
892 }
893
894 // vector operator != : returns true for elements for which a != b
895 static inline Vec16ib operator != (Vec16i const & a, Vec16i const & b) {
896 return Vec16ib(a.get_low() != b.get_low(), a.get_high() != b.get_high());
897 }
898
899 // vector operator > : returns true for elements for which a > b
900 static inline Vec16ib operator > (Vec16i const & a, Vec16i const & b) {
901 return Vec16ib(a.get_low() > b.get_low(), a.get_high() > b.get_high());
902 }
903
904 // vector operator < : returns true for elements for which a < b
905 static inline Vec16ib operator < (Vec16i const & a, Vec16i const & b) {
906 return b > a;
907 }
908
909 // vector operator >= : returns true for elements for which a >= b (signed)
910 static inline Vec16ib operator >= (Vec16i const & a, Vec16i const & b) {
911 return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
912 }
913
914 // vector operator <= : returns true for elements for which a <= b (signed)
915 static inline Vec16ib operator <= (Vec16i const & a, Vec16i const & b) {
916 return b >= a;
917 }
918
919 // vector operator & : bitwise and
920 static inline Vec16i operator & (Vec16i const & a, Vec16i const & b) {
921 return Vec16i(a.get_low() & b.get_low(), a.get_high() & b.get_high());
922 }
923
924 // vector operator &= : bitwise and
925 static inline Vec16i & operator &= (Vec16i & a, Vec16i const & b) {
926 a = a & b;
927 return a;
928 }
929
930 // vector operator | : bitwise or
931 static inline Vec16i operator | (Vec16i const & a, Vec16i const & b) {
932 return Vec16i(a.get_low() | b.get_low(), a.get_high() | b.get_high());
933 }
934
935 // vector operator |= : bitwise or
936 static inline Vec16i & operator |= (Vec16i & a, Vec16i const & b) {
937 a = a | b;
938 return a;
939 }
940
941 // vector operator ^ : bitwise xor
942 static inline Vec16i operator ^ (Vec16i const & a, Vec16i const & b) {
943 return Vec16i(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
944 }
945
946 // vector operator ^= : bitwise xor
947 static inline Vec16i & operator ^= (Vec16i & a, Vec16i const & b) {
948 a = a ^ b;
949 return a;
950 }
951
952 // vector operator ~ : bitwise not
953 static inline Vec16i operator ~ (Vec16i const & a) {
954 return Vec16i(~(a.get_low()), ~(a.get_high()));
955 }
956
957 // Functions for this class
958
959 // Select between two operands. Corresponds to this pseudocode:
960 // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec16ib const & s,Vec16i const & a,Vec16i const & b)961 static inline Vec16i select (Vec16ib const & s, Vec16i const & a, Vec16i const & b) {
962 return Vec16i(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
963 }
964
965 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec16ib const & f,Vec16i const & a,Vec16i const & b)966 static inline Vec16i if_add (Vec16ib const & f, Vec16i const & a, Vec16i const & b) {
967 return Vec16i(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
968 }
969
970 // Horizontal add: Calculates the sum of all vector elements.
971 // Overflow will wrap around
horizontal_add(Vec16i const & a)972 static inline int32_t horizontal_add (Vec16i const & a) {
973 return horizontal_add(a.get_low() + a.get_high());
974 }
975
976 // function add_saturated: add element by element, signed with saturation
add_saturated(Vec16i const & a,Vec16i const & b)977 static inline Vec16i add_saturated(Vec16i const & a, Vec16i const & b) {
978 return Vec16i(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
979 }
980
981 // function sub_saturated: subtract element by element, signed with saturation
sub_saturated(Vec16i const & a,Vec16i const & b)982 static inline Vec16i sub_saturated(Vec16i const & a, Vec16i const & b) {
983 return Vec16i(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
984 }
985
986 // function max: a > b ? a : b
max(Vec16i const & a,Vec16i const & b)987 static inline Vec16i max(Vec16i const & a, Vec16i const & b) {
988 return Vec16i(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
989 }
990
991 // function min: a < b ? a : b
min(Vec16i const & a,Vec16i const & b)992 static inline Vec16i min(Vec16i const & a, Vec16i const & b) {
993 return Vec16i(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
994 }
995
996 // function abs: a >= 0 ? a : -a
abs(Vec16i const & a)997 static inline Vec16i abs(Vec16i const & a) {
998 return Vec16i(abs(a.get_low()), abs(a.get_high()));
999 }
1000
1001 // function abs_saturated: same as abs, saturate if overflow
abs_saturated(Vec16i const & a)1002 static inline Vec16i abs_saturated(Vec16i const & a) {
1003 return Vec16i(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
1004 }
1005
1006 // function rotate_left all elements
1007 // Use negative count to rotate right
rotate_left(Vec16i const & a,int b)1008 static inline Vec16i rotate_left(Vec16i const & a, int b) {
1009 return Vec16i(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b));
1010 }
1011
1012
1013 /*****************************************************************************
1014 *
1015 * Vector of 16 32-bit unsigned integers
1016 *
1017 *****************************************************************************/
1018
1019 class Vec16ui : public Vec16i {
1020 public:
1021 // Default constructor:
Vec16ui()1022 Vec16ui() {
1023 };
1024 // Constructor to broadcast the same value into all elements:
Vec16ui(uint32_t i)1025 Vec16ui(uint32_t i) {
1026 z0 = z1 = Vec8ui(i);
1027 };
1028 // Constructor to build from all elements:
Vec16ui(uint32_t i0,uint32_t i1,uint32_t i2,uint32_t i3,uint32_t i4,uint32_t i5,uint32_t i6,uint32_t i7,uint32_t i8,uint32_t i9,uint32_t i10,uint32_t i11,uint32_t i12,uint32_t i13,uint32_t i14,uint32_t i15)1029 Vec16ui(uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3, uint32_t i4, uint32_t i5, uint32_t i6, uint32_t i7,
1030 uint32_t i8, uint32_t i9, uint32_t i10, uint32_t i11, uint32_t i12, uint32_t i13, uint32_t i14, uint32_t i15) {
1031 z0 = Vec8ui(i0, i1, i2, i3, i4, i5, i6, i7);
1032 z1 = Vec8ui(i8, i9, i10, i11, i12, i13, i14, i15);
1033 };
1034 // Constructor to build from two Vec8ui:
Vec16ui(Vec8ui const & a0,Vec8ui const & a1)1035 Vec16ui(Vec8ui const & a0, Vec8ui const & a1) {
1036 z0 = a0;
1037 z1 = a1;
1038 }
1039 // Constructor to convert from type Vec512b
Vec16ui(Vec512b const & x)1040 Vec16ui(Vec512b const & x) {
1041 *this = x;
1042 };
1043 // Assignment operator to convert from type Vec512b
1044 Vec16ui & operator = (Vec512b const & x) {
1045 z0 = x.get_low();
1046 z1 = x.get_high();
1047 return *this;
1048 };
1049 // Member function to load from array (unaligned)
load(void const * p)1050 Vec16ui & load(void const * p) {
1051 Vec16i::load(p);
1052 return *this;
1053 }
1054 // Member function to load from array, aligned by 64
load_a(void const * p)1055 Vec16ui & load_a(void const * p) {
1056 Vec16i::load_a(p);
1057 return *this;
1058 }
1059 // Member function to change a single element in vector
1060 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,uint32_t value)1061 Vec16ui const & insert(uint32_t index, uint32_t value) {
1062 Vec16i::insert(index, value);
1063 return *this;
1064 }
1065 // Member function extract a single element from vector
extract(uint32_t index)1066 uint32_t extract(uint32_t index) const {
1067 return Vec16i::extract(index);
1068 }
1069 // Extract a single element. Use store function if extracting more than one element.
1070 // Operator [] can only read an element, not write.
1071 uint32_t operator [] (uint32_t index) const {
1072 return extract(index);
1073 }
1074 // Member functions to split into two Vec4ui:
get_low()1075 Vec8ui get_low() const {
1076 return Vec8ui(Vec16i::get_low());
1077 }
get_high()1078 Vec8ui get_high() const {
1079 return Vec8ui(Vec16i::get_high());
1080 }
1081 };
1082
1083 // Define operators for this class
1084
1085 // vector operator + : add
1086 static inline Vec16ui operator + (Vec16ui const & a, Vec16ui const & b) {
1087 return Vec16ui (Vec16i(a) + Vec16i(b));
1088 }
1089
1090 // vector operator - : subtract
1091 static inline Vec16ui operator - (Vec16ui const & a, Vec16ui const & b) {
1092 return Vec16ui (Vec16i(a) - Vec16i(b));
1093 }
1094
1095 // vector operator * : multiply
1096 static inline Vec16ui operator * (Vec16ui const & a, Vec16ui const & b) {
1097 return Vec16ui (Vec16i(a) * Vec16i(b));
1098 }
1099
1100 // vector operator / : divide
1101 // See bottom of file
1102
1103 // vector operator >> : shift right logical all elements
1104 static inline Vec16ui operator >> (Vec16ui const & a, uint32_t b) {
1105 return Vec16ui(a.get_low() >> b, a.get_high() >> b);
1106 }
1107
1108 // vector operator >> : shift right logical all elements
1109 static inline Vec16ui operator >> (Vec16ui const & a, int32_t b) {
1110 return a >> (uint32_t)b;
1111 }
1112
1113 // vector operator >>= : shift right logical
1114 static inline Vec16ui & operator >>= (Vec16ui & a, uint32_t b) {
1115 a = a >> b;
1116 return a;
1117 }
1118
1119 // vector operator >>= : shift right logical
1120 static inline Vec16ui & operator >>= (Vec16ui & a, int32_t b) {
1121 a = a >> uint32_t(b);
1122 return a;
1123 }
1124
1125 // vector operator << : shift left all elements
1126 static inline Vec16ui operator << (Vec16ui const & a, uint32_t b) {
1127 return Vec16ui ((Vec16i)a << (int32_t)b);
1128 }
1129
1130 // vector operator << : shift left all elements
1131 static inline Vec16ui operator << (Vec16ui const & a, int32_t b) {
1132 return Vec16ui ((Vec16i)a << (int32_t)b);
1133 }
1134
1135 // vector operator < : returns true for elements for which a < b (unsigned)
1136 static inline Vec16ib operator < (Vec16ui const & a, Vec16ui const & b) {
1137 return Vec16ib(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1138 }
1139
1140 // vector operator > : returns true for elements for which a > b (unsigned)
1141 static inline Vec16ib operator > (Vec16ui const & a, Vec16ui const & b) {
1142 return b < a;
1143 }
1144
1145 // vector operator >= : returns true for elements for which a >= b (unsigned)
1146 static inline Vec16ib operator >= (Vec16ui const & a, Vec16ui const & b) {
1147 return Vec16ib(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1148 }
1149
1150 // vector operator <= : returns true for elements for which a <= b (unsigned)
1151 static inline Vec16ib operator <= (Vec16ui const & a, Vec16ui const & b) {
1152 return b >= a;
1153 }
1154
1155 // vector operator & : bitwise and
1156 static inline Vec16ui operator & (Vec16ui const & a, Vec16ui const & b) {
1157 return Vec16ui(Vec16i(a) & Vec16i(b));
1158 }
1159
1160 // vector operator | : bitwise or
1161 static inline Vec16ui operator | (Vec16ui const & a, Vec16ui const & b) {
1162 return Vec16ui(Vec16i(a) | Vec16i(b));
1163 }
1164
1165 // vector operator ^ : bitwise xor
1166 static inline Vec16ui operator ^ (Vec16ui const & a, Vec16ui const & b) {
1167 return Vec16ui(Vec16i(a) ^ Vec16i(b));
1168 }
1169
1170 // vector operator ~ : bitwise not
1171 static inline Vec16ui operator ~ (Vec16ui const & a) {
1172 return Vec16ui( ~ Vec16i(a));
1173 }
1174
1175 // Functions for this class
1176
1177 // Select between two operands. Corresponds to this pseudocode:
1178 // for (int i = 0; i < 16; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec16ib const & s,Vec16ui const & a,Vec16ui const & b)1179 static inline Vec16ui select (Vec16ib const & s, Vec16ui const & a, Vec16ui const & b) {
1180 return Vec16ui(select(s, Vec16i(a), Vec16i(b)));
1181 }
1182
1183 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec16ib const & f,Vec16ui const & a,Vec16ui const & b)1184 static inline Vec16ui if_add (Vec16ib const & f, Vec16ui const & a, Vec16ui const & b) {
1185 return Vec16ui(if_add(f, Vec16i(a), Vec16i(b)));
1186 }
1187
1188 // Horizontal add: Calculates the sum of all vector elements.
1189 // Overflow will wrap around
horizontal_add(Vec16ui const & a)1190 static inline uint32_t horizontal_add (Vec16ui const & a) {
1191 return horizontal_add((Vec16i)a);
1192 }
1193
1194 // horizontal_add_x: Horizontal add extended: Calculates the sum of all vector elements. Defined later in this file
1195
1196 // function add_saturated: add element by element, unsigned with saturation
add_saturated(Vec16ui const & a,Vec16ui const & b)1197 static inline Vec16ui add_saturated(Vec16ui const & a, Vec16ui const & b) {
1198 return Vec16ui(add_saturated(a.get_low(), b.get_low()), add_saturated(a.get_high(), b.get_high()));
1199 }
1200
1201 // function sub_saturated: subtract element by element, unsigned with saturation
sub_saturated(Vec16ui const & a,Vec16ui const & b)1202 static inline Vec16ui sub_saturated(Vec16ui const & a, Vec16ui const & b) {
1203 return Vec16ui(sub_saturated(a.get_low(), b.get_low()), sub_saturated(a.get_high(), b.get_high()));
1204 }
1205
1206 // function max: a > b ? a : b
max(Vec16ui const & a,Vec16ui const & b)1207 static inline Vec16ui max(Vec16ui const & a, Vec16ui const & b) {
1208 return Vec16ui(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1209 }
1210
1211 // function min: a < b ? a : b
min(Vec16ui const & a,Vec16ui const & b)1212 static inline Vec16ui min(Vec16ui const & a, Vec16ui const & b) {
1213 return Vec16ui(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1214 }
1215
1216
1217 /*****************************************************************************
1218 *
1219 * Vector of 8 64-bit signed integers
1220 *
1221 *****************************************************************************/
1222
1223 class Vec8q : public Vec512b {
1224 public:
1225 // Default constructor:
Vec8q()1226 Vec8q() {
1227 }
1228 // Constructor to broadcast the same value into all elements:
Vec8q(int64_t i)1229 Vec8q(int64_t i) {
1230 z0 = z1 = Vec4q(i);
1231 }
1232 // Constructor to build from all elements:
Vec8q(int64_t i0,int64_t i1,int64_t i2,int64_t i3,int64_t i4,int64_t i5,int64_t i6,int64_t i7)1233 Vec8q(int64_t i0, int64_t i1, int64_t i2, int64_t i3, int64_t i4, int64_t i5, int64_t i6, int64_t i7) {
1234 z0 = Vec4q(i0, i1, i2, i3);
1235 z1 = Vec4q(i4, i5, i6, i7);
1236 }
1237 // Constructor to build from two Vec4q:
Vec8q(Vec4q const & a0,Vec4q const & a1)1238 Vec8q(Vec4q const & a0, Vec4q const & a1) {
1239 z0 = a0;
1240 z1 = a1;
1241 }
1242 // Constructor to convert from type Vec512b
Vec8q(Vec512b const & x)1243 Vec8q(Vec512b const & x) {
1244 z0 = x.get_low();
1245 z1 = x.get_high();
1246 }
1247 // Assignment operator to convert from type Vec512b
1248 Vec8q & operator = (Vec512b const & x) {
1249 z0 = x.get_low();
1250 z1 = x.get_high();
1251 return *this;
1252 }
1253 // Member function to load from array (unaligned)
load(void const * p)1254 Vec8q & load(void const * p) {
1255 z0 = Vec4q().load(p);
1256 z1 = Vec4q().load((int64_t const*)p+4);
1257 return *this;
1258 }
1259 // Member function to load from array, aligned by 64
load_a(void const * p)1260 Vec8q & load_a(void const * p) {
1261 z0 = Vec4q().load_a(p);
1262 z1 = Vec4q().load_a((int64_t const*)p+4);
1263 return *this;
1264 }
1265 // Partial load. Load n elements and set the rest to 0
load_partial(int n,void const * p)1266 Vec8q & load_partial(int n, void const * p) {
1267 if (n < 4) {
1268 z0 = Vec4q().load_partial(n, p);
1269 z1 = Vec4q(0);
1270 }
1271 else {
1272 z0 = Vec4q().load(p);
1273 z1 = Vec4q().load_partial(n - 4, (int64_t const*)p + 4);
1274 }
1275 return *this;
1276 }
1277 // Partial store. Store n elements
store_partial(int n,void * p)1278 void store_partial(int n, void * p) const {
1279 if (n < 4) {
1280 Vec4q(get_low()).store_partial(n, p);
1281 }
1282 else {
1283 Vec4q(get_low()).store(p);
1284 Vec4q(get_high()).store_partial(n - 4, (int64_t *)p + 4);
1285 }
1286 }
1287 // cut off vector to n elements. The last 8-n elements are set to zero
cutoff(int n)1288 Vec8q & cutoff(int n) {
1289 if (n < 4) {
1290 z0 = Vec4q(z0).cutoff(n);
1291 z1 = Vec4q(0);
1292 }
1293 else {
1294 z1 = Vec4q(z1).cutoff(n - 4);
1295 }
1296 return *this;
1297 }
1298 // Member function to change a single element in vector
1299 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,int64_t value)1300 Vec8q const & insert(uint32_t index, int64_t value) {
1301 if (index < 4) {
1302 z0 = Vec4q(z0).insert(index, value);
1303 }
1304 else {
1305 z1 = Vec4q(z1).insert(index-4, value);
1306 }
1307 return *this;
1308 }
1309 // Member function extract a single element from vector
extract(uint32_t index)1310 int64_t extract(uint32_t index) const {
1311 if (index < 4) {
1312 return Vec4q(z0).extract(index);
1313 }
1314 else {
1315 return Vec4q(z1).extract(index - 4);
1316 }
1317 }
1318 // Extract a single element. Use store function if extracting more than one element.
1319 // Operator [] can only read an element, not write.
1320 int64_t operator [] (uint32_t index) const {
1321 return extract(index);
1322 }
1323 // Member functions to split into two Vec2q:
get_low()1324 Vec4q get_low() const {
1325 return Vec4q(z0);
1326 }
get_high()1327 Vec4q get_high() const {
1328 return Vec4q(z1);
1329 }
size()1330 static int size () {
1331 return 8;
1332 }
1333 };
1334
1335
1336 // Define operators for Vec8q
1337
1338 // vector operator + : add element by element
1339 static inline Vec8q operator + (Vec8q const & a, Vec8q const & b) {
1340 return Vec8q(a.get_low() + b.get_low(), a.get_high() + b.get_high());
1341 }
1342
1343 // vector operator += : add
1344 static inline Vec8q & operator += (Vec8q & a, Vec8q const & b) {
1345 a = a + b;
1346 return a;
1347 }
1348
1349 // postfix operator ++
1350 static inline Vec8q operator ++ (Vec8q & a, int) {
1351 Vec8q a0 = a;
1352 a = a + 1;
1353 return a0;
1354 }
1355
1356 // prefix operator ++
1357 static inline Vec8q & operator ++ (Vec8q & a) {
1358 a = a + 1;
1359 return a;
1360 }
1361
1362 // vector operator - : subtract element by element
1363 static inline Vec8q operator - (Vec8q const & a, Vec8q const & b) {
1364 return Vec8q(a.get_low() - b.get_low(), a.get_high() - b.get_high());
1365 }
1366
1367 // vector operator - : unary minus
1368 static inline Vec8q operator - (Vec8q const & a) {
1369 return Vec8q(- a.get_low(), - a.get_high());
1370 }
1371
1372 // vector operator -= : subtract
1373 static inline Vec8q & operator -= (Vec8q & a, Vec8q const & b) {
1374 a = a - b;
1375 return a;
1376 }
1377
1378 // postfix operator --
1379 static inline Vec8q operator -- (Vec8q & a, int) {
1380 Vec8q a0 = a;
1381 a = a - 1;
1382 return a0;
1383 }
1384
1385 // prefix operator --
1386 static inline Vec8q & operator -- (Vec8q & a) {
1387 a = a - 1;
1388 return a;
1389 }
1390
1391 // vector operator * : multiply element by element
1392 static inline Vec8q operator * (Vec8q const & a, Vec8q const & b) {
1393 return Vec8q(a.get_low() * b.get_low(), a.get_high() * b.get_high());
1394 }
1395
1396 // vector operator *= : multiply
1397 static inline Vec8q & operator *= (Vec8q & a, Vec8q const & b) {
1398 a = a * b;
1399 return a;
1400 }
1401
1402 // vector operator << : shift left
1403 static inline Vec8q operator << (Vec8q const & a, int32_t b) {
1404 return Vec8q(a.get_low() << b, a.get_high() << b);
1405 }
1406
1407 // vector operator <<= : shift left
1408 static inline Vec8q & operator <<= (Vec8q & a, int32_t b) {
1409 a = a << b;
1410 return a;
1411 }
1412
1413 // vector operator >> : shift right arithmetic
1414 static inline Vec8q operator >> (Vec8q const & a, int32_t b) {
1415 return Vec8q(a.get_low() >> b, a.get_high() >> b);
1416 }
1417
1418 // vector operator >>= : shift right arithmetic
1419 static inline Vec8q & operator >>= (Vec8q & a, int32_t b) {
1420 a = a >> b;
1421 return a;
1422 }
1423
1424 // vector operator == : returns true for elements for which a == b
1425 static inline Vec8qb operator == (Vec8q const & a, Vec8q const & b) {
1426 return Vec8qb(a.get_low() == b.get_low(), a.get_high() == b.get_high());
1427 }
1428
1429 // vector operator != : returns true for elements for which a != b
1430 static inline Vec8qb operator != (Vec8q const & a, Vec8q const & b) {
1431 return Vec8qb(a.get_low() != b.get_low(), a.get_high() != b.get_high());
1432 }
1433
1434 // vector operator < : returns true for elements for which a < b
1435 static inline Vec8qb operator < (Vec8q const & a, Vec8q const & b) {
1436 return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1437 }
1438
1439 // vector operator > : returns true for elements for which a > b
1440 static inline Vec8qb operator > (Vec8q const & a, Vec8q const & b) {
1441 return b < a;
1442 }
1443
1444 // vector operator >= : returns true for elements for which a >= b (signed)
1445 static inline Vec8qb operator >= (Vec8q const & a, Vec8q const & b) {
1446 return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1447 }
1448
1449 // vector operator <= : returns true for elements for which a <= b (signed)
1450 static inline Vec8qb operator <= (Vec8q const & a, Vec8q const & b) {
1451 return b >= a;
1452 }
1453
1454 // vector operator & : bitwise and
1455 static inline Vec8q operator & (Vec8q const & a, Vec8q const & b) {
1456 return Vec8q(a.get_low() & b.get_low(), a.get_high() & b.get_high());
1457 }
1458
1459 // vector operator &= : bitwise and
1460 static inline Vec8q & operator &= (Vec8q & a, Vec8q const & b) {
1461 a = a & b;
1462 return a;
1463 }
1464
1465 // vector operator | : bitwise or
1466 static inline Vec8q operator | (Vec8q const & a, Vec8q const & b) {
1467 return Vec8q(a.get_low() | b.get_low(), a.get_high() | b.get_high());
1468 }
1469
1470 // vector operator |= : bitwise or
1471 static inline Vec8q & operator |= (Vec8q & a, Vec8q const & b) {
1472 a = a | b;
1473 return a;
1474 }
1475
1476 // vector operator ^ : bitwise xor
1477 static inline Vec8q operator ^ (Vec8q const & a, Vec8q const & b) {
1478 return Vec8q(a.get_low() ^ b.get_low(), a.get_high() ^ b.get_high());
1479 }
1480 // vector operator ^= : bitwise xor
1481 static inline Vec8q & operator ^= (Vec8q & a, Vec8q const & b) {
1482 a = a ^ b;
1483 return a;
1484 }
1485
1486 // vector operator ~ : bitwise not
1487 static inline Vec8q operator ~ (Vec8q const & a) {
1488 return Vec8q(~(a.get_low()), ~(a.get_high()));
1489 }
1490
1491 // Functions for this class
1492
1493 // Select between two operands. Corresponds to this pseudocode:
1494 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec8qb const & s,Vec8q const & a,Vec8q const & b)1495 static inline Vec8q select (Vec8qb const & s, Vec8q const & a, Vec8q const & b) {
1496 return Vec8q(select(s.get_low(), a.get_low(), b.get_low()), select(s.get_high(), a.get_high(), b.get_high()));
1497 }
1498
1499 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8qb const & f,Vec8q const & a,Vec8q const & b)1500 static inline Vec8q if_add (Vec8qb const & f, Vec8q const & a, Vec8q const & b) {
1501 return Vec8q(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
1502 }
1503
1504 // Horizontal add: Calculates the sum of all vector elements.
1505 // Overflow will wrap around
horizontal_add(Vec8q const & a)1506 static inline int64_t horizontal_add (Vec8q const & a) {
1507 return horizontal_add(a.get_low() + a.get_high());
1508 }
1509
1510 // Horizontal add extended: Calculates the sum of all vector elements
1511 // Elements are sign extended before adding to avoid overflow
horizontal_add_x(Vec16i const & x)1512 static inline int64_t horizontal_add_x (Vec16i const & x) {
1513 return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high());
1514 }
1515
1516 // Horizontal add extended: Calculates the sum of all vector elements
1517 // Elements are zero extended before adding to avoid overflow
horizontal_add_x(Vec16ui const & x)1518 static inline uint64_t horizontal_add_x (Vec16ui const & x) {
1519 return horizontal_add_x(x.get_low()) + horizontal_add_x(x.get_high());
1520 }
1521
1522 // function max: a > b ? a : b
max(Vec8q const & a,Vec8q const & b)1523 static inline Vec8q max(Vec8q const & a, Vec8q const & b) {
1524 return Vec8q(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1525 }
1526
1527 // function min: a < b ? a : b
min(Vec8q const & a,Vec8q const & b)1528 static inline Vec8q min(Vec8q const & a, Vec8q const & b) {
1529 return Vec8q(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1530 }
1531
1532 // function abs: a >= 0 ? a : -a
abs(Vec8q const & a)1533 static inline Vec8q abs(Vec8q const & a) {
1534 return Vec8q(abs(a.get_low()), abs(a.get_high()));
1535 }
1536
1537 // function abs_saturated: same as abs, saturate if overflow
abs_saturated(Vec8q const & a)1538 static inline Vec8q abs_saturated(Vec8q const & a) {
1539 return Vec8q(abs_saturated(a.get_low()), abs_saturated(a.get_high()));
1540 }
1541
1542 // function rotate_left all elements
1543 // Use negative count to rotate right
rotate_left(Vec8q const & a,int b)1544 static inline Vec8q rotate_left(Vec8q const & a, int b) {
1545 return Vec8q(rotate_left(a.get_low(), b), rotate_left(a.get_high(), b));
1546 }
1547
1548
1549 /*****************************************************************************
1550 *
1551 * Vector of 8 64-bit unsigned integers
1552 *
1553 *****************************************************************************/
1554
1555 class Vec8uq : public Vec8q {
1556 public:
1557 // Default constructor:
Vec8uq()1558 Vec8uq() {
1559 }
1560 // Constructor to broadcast the same value into all elements:
Vec8uq(uint64_t i)1561 Vec8uq(uint64_t i) {
1562 z0 = z1 = Vec4uq(i);
1563 }
1564 // Constructor to convert from Vec8q:
Vec8uq(Vec8q const & x)1565 Vec8uq(Vec8q const & x) {
1566 z0 = x.get_low();
1567 z1 = x.get_high();
1568 }
1569 // Constructor to convert from type Vec512b
Vec8uq(Vec512b const & x)1570 Vec8uq(Vec512b const & x) {
1571 z0 = x.get_low();
1572 z1 = x.get_high();
1573 }
1574 // Constructor to build from all elements:
Vec8uq(uint64_t i0,uint64_t i1,uint64_t i2,uint64_t i3,uint64_t i4,uint64_t i5,uint64_t i6,uint64_t i7)1575 Vec8uq(uint64_t i0, uint64_t i1, uint64_t i2, uint64_t i3, uint64_t i4, uint64_t i5, uint64_t i6, uint64_t i7) {
1576 z0 = Vec4q(i0, i1, i2, i3);
1577 z1 = Vec4q(i4, i5, i6, i7);
1578 }
1579 // Constructor to build from two Vec4uq:
Vec8uq(Vec4uq const & a0,Vec4uq const & a1)1580 Vec8uq(Vec4uq const & a0, Vec4uq const & a1) {
1581 z0 = a0;
1582 z1 = a1;
1583 }
1584 // Assignment operator to convert from Vec8q:
1585 Vec8uq & operator = (Vec8q const & x) {
1586 z0 = x.get_low();
1587 z1 = x.get_high();
1588 return *this;
1589 }
1590 // Assignment operator to convert from type Vec512b
1591 Vec8uq & operator = (Vec512b const & x) {
1592 z0 = x.get_low();
1593 z1 = x.get_high();
1594 return *this;
1595 }
1596 // Member function to load from array (unaligned)
load(void const * p)1597 Vec8uq & load(void const * p) {
1598 Vec8q::load(p);
1599 return *this;
1600 }
1601 // Member function to load from array, aligned by 32
load_a(void const * p)1602 Vec8uq & load_a(void const * p) {
1603 Vec8q::load_a(p);
1604 return *this;
1605 }
1606 // Member function to change a single element in vector
1607 // Note: This function is inefficient. Use load function if changing more than one element
insert(uint32_t index,uint64_t value)1608 Vec8uq const & insert(uint32_t index, uint64_t value) {
1609 Vec8q::insert(index, value);
1610 return *this;
1611 }
1612 // Member function extract a single element from vector
extract(uint32_t index)1613 uint64_t extract(uint32_t index) const {
1614 return Vec8q::extract(index);
1615 }
1616 // Extract a single element. Use store function if extracting more than one element.
1617 // Operator [] can only read an element, not write.
1618 uint64_t operator [] (uint32_t index) const {
1619 return extract(index);
1620 }
1621 // Member functions to split into two Vec2uq:
get_low()1622 Vec4uq get_low() const {
1623 return Vec4uq(Vec8q::get_low());
1624 }
get_high()1625 Vec4uq get_high() const {
1626 return Vec4uq(Vec8q::get_high());
1627 }
1628 };
1629
1630 // Define operators for this class
1631
1632 // vector operator + : add
1633 static inline Vec8uq operator + (Vec8uq const & a, Vec8uq const & b) {
1634 return Vec8uq (Vec8q(a) + Vec8q(b));
1635 }
1636
1637 // vector operator - : subtract
1638 static inline Vec8uq operator - (Vec8uq const & a, Vec8uq const & b) {
1639 return Vec8uq (Vec8q(a) - Vec8q(b));
1640 }
1641
1642 // vector operator * : multiply element by element
1643 static inline Vec8uq operator * (Vec8uq const & a, Vec8uq const & b) {
1644 return Vec8uq (Vec8q(a) * Vec8q(b));
1645 }
1646
1647 // vector operator >> : shift right logical all elements
1648 static inline Vec8uq operator >> (Vec8uq const & a, uint32_t b) {
1649 return Vec8uq(a.get_low() >> b, a.get_high() >> b);
1650 }
1651
1652 // vector operator >> : shift right logical all elements
1653 static inline Vec8uq operator >> (Vec8uq const & a, int32_t b) {
1654 return a >> (uint32_t)b;
1655 }
1656
1657 // vector operator >>= : shift right artihmetic
1658 static inline Vec8uq & operator >>= (Vec8uq & a, uint32_t b) {
1659 a = a >> b;
1660 return a;
1661 }
1662
1663 // vector operator >>= : shift right logical
1664 static inline Vec8uq & operator >>= (Vec8uq & a, int32_t b) {
1665 a = a >> uint32_t(b);
1666 return a;
1667 }
1668
1669 // vector operator << : shift left all elements
1670 static inline Vec8uq operator << (Vec8uq const & a, uint32_t b) {
1671 return Vec8uq ((Vec8q)a << (int32_t)b);
1672 }
1673
1674 // vector operator << : shift left all elements
1675 static inline Vec8uq operator << (Vec8uq const & a, int32_t b) {
1676 return Vec8uq ((Vec8q)a << b);
1677 }
1678
1679 // vector operator < : returns true for elements for which a < b (unsigned)
1680 static inline Vec8qb operator < (Vec8uq const & a, Vec8uq const & b) {
1681 return Vec8qb(a.get_low() < b.get_low(), a.get_high() < b.get_high());
1682 }
1683
1684 // vector operator > : returns true for elements for which a > b (unsigned)
1685 static inline Vec8qb operator > (Vec8uq const & a, Vec8uq const & b) {
1686 return b < a;
1687 }
1688
1689 // vector operator >= : returns true for elements for which a >= b (unsigned)
1690 static inline Vec8qb operator >= (Vec8uq const & a, Vec8uq const & b) {
1691 return Vec8qb(a.get_low() >= b.get_low(), a.get_high() >= b.get_high());
1692 }
1693
1694 // vector operator <= : returns true for elements for which a <= b (unsigned)
1695 static inline Vec8qb operator <= (Vec8uq const & a, Vec8uq const & b) {
1696 return b >= a;
1697 }
1698
1699 // vector operator & : bitwise and
1700 static inline Vec8uq operator & (Vec8uq const & a, Vec8uq const & b) {
1701 return Vec8uq(Vec8q(a) & Vec8q(b));
1702 }
1703
1704 // vector operator | : bitwise or
1705 static inline Vec8uq operator | (Vec8uq const & a, Vec8uq const & b) {
1706 return Vec8uq(Vec8q(a) | Vec8q(b));
1707 }
1708
1709 // vector operator ^ : bitwise xor
1710 static inline Vec8uq operator ^ (Vec8uq const & a, Vec8uq const & b) {
1711 return Vec8uq(Vec8q(a) ^ Vec8q(b));
1712 }
1713
1714 // Functions for this class
1715
1716 // Select between two operands. Corresponds to this pseudocode:
1717 // for (int i = 0; i < 4; i++) result[i] = s[i] ? a[i] : b[i];
select(Vec8qb const & s,Vec8uq const & a,Vec8uq const & b)1718 static inline Vec8uq select (Vec8qb const & s, Vec8uq const & a, Vec8uq const & b) {
1719 return Vec8uq(select(s, Vec8q(a), Vec8q(b)));
1720 }
1721
1722 // Conditional add: For all vector elements i: result[i] = f[i] ? (a[i] + b[i]) : a[i]
if_add(Vec8qb const & f,Vec8uq const & a,Vec8uq const & b)1723 static inline Vec8uq if_add (Vec8qb const & f, Vec8uq const & a, Vec8uq const & b) {
1724 return Vec8uq(if_add(f.get_low(), a.get_low(), b.get_low()), if_add(f.get_high(), a.get_high(), b.get_high()));
1725 }
1726
1727 // Horizontal add: Calculates the sum of all vector elements.
1728 // Overflow will wrap around
horizontal_add(Vec8uq const & a)1729 static inline uint64_t horizontal_add (Vec8uq const & a) {
1730 return horizontal_add(Vec8q(a));
1731 }
1732
1733 // function max: a > b ? a : b
max(Vec8uq const & a,Vec8uq const & b)1734 static inline Vec8uq max(Vec8uq const & a, Vec8uq const & b) {
1735 return Vec8uq(max(a.get_low(), b.get_low()), max(a.get_high(), b.get_high()));
1736 }
1737
1738 // function min: a < b ? a : b
min(Vec8uq const & a,Vec8uq const & b)1739 static inline Vec8uq min(Vec8uq const & a, Vec8uq const & b) {
1740 return Vec8uq(min(a.get_low(), b.get_low()), min(a.get_high(), b.get_high()));
1741 }
1742
1743
1744 /*****************************************************************************
1745 *
1746 * Vector permute functions
1747 *
1748 ******************************************************************************
1749 *
1750 * These permute functions can reorder the elements of a vector and optionally
1751 * set some elements to zero.
1752 *
1753 * The indexes are inserted as template parameters in <>. These indexes must be
1754 * constants. Each template parameter is an index to the element you want to select.
1755 * An index of -1 will generate zero. An index of -256 means don't care.
1756 *
1757 * Example:
1758 * Vec8q a(10,11,12,13,14,15,16,17); // a is (10,11,12,13,14,15,16,17)
1759 * Vec8q b;
1760 * b = permute8q<0,2,7,7,-1,-1,1,1>(a); // b is (10,12,17,17, 0, 0,11,11)
1761 *
1762 * A lot of the code here is metaprogramming aiming to find the instructions
1763 * that best fit the template parameters and instruction set. The metacode
1764 * will be reduced out to leave only a few vector instructions in release
1765 * mode with optimization on.
1766 *****************************************************************************/
1767
1768 // Permute vector of 8 64-bit integers.
1769 // Index -1 gives 0, index -256 means don't care.
1770 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8q(Vec8q const & a)1771 static inline Vec8q permute8q(Vec8q const & a) {
1772 return Vec8q(blend4q<i0,i1,i2,i3> (a.get_low(), a.get_high()),
1773 blend4q<i4,i5,i6,i7> (a.get_low(), a.get_high()));
1774 }
1775
1776 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
permute8uq(Vec8uq const & a)1777 static inline Vec8uq permute8uq(Vec8uq const & a) {
1778 return Vec8uq (permute8q<i0,i1,i2,i3,i4,i5,i6,i7> (a));
1779 }
1780
1781
1782 // Permute vector of 16 32-bit integers.
1783 // Index -1 gives 0, index -256 means don't care.
1784 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
permute16i(Vec16i const & a)1785 static inline Vec16i permute16i(Vec16i const & a) {
1786 return Vec16i(blend8i<i0,i1,i2 ,i3 ,i4 ,i5 ,i6 ,i7 > (a.get_low(), a.get_high()),
1787 blend8i<i8,i9,i10,i11,i12,i13,i14,i15> (a.get_low(), a.get_high()));
1788 }
1789
1790 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
permute16ui(Vec16ui const & a)1791 static inline Vec16ui permute16ui(Vec16ui const & a) {
1792 return Vec16ui (permute16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (a));
1793 }
1794
1795
1796 /*****************************************************************************
1797 *
1798 * Vector blend functions
1799 *
1800 ******************************************************************************
1801 *
1802 * These blend functions can mix elements from two different vectors and
1803 * optionally set some elements to zero.
1804 *
1805 * The indexes are inserted as template parameters in <>. These indexes must be
1806 * constants. Each template parameter is an index to the element you want to
1807 * select, where higher indexes indicate an element from the second source
1808 * vector. For example, if each vector has 8 elements, then indexes 0 - 7
1809 * will select an element from the first vector and indexes 8 - 15 will select
1810 * an element from the second vector. A negative index will generate zero.
1811 *
1812 * Example:
1813 * Vec8q a(100,101,102,103,104,105,106,107); // a is (100, 101, 102, 103, 104, 105, 106, 107)
1814 * Vec8q b(200,201,202,203,204,205,206,207); // b is (200, 201, 202, 203, 204, 205, 206, 207)
1815 * Vec8q c;
1816 * c = blend8q<1,0,9,8,7,-1,15,15> (a,b); // c is (101, 100, 201, 200, 107, 0, 207, 207)
1817 *
1818 * A lot of the code here is metaprogramming aiming to find the instructions
1819 * that best fit the template parameters and instruction set. The metacode
1820 * will be reduced out to leave only a few vector instructions in release
1821 * mode with optimization on.
1822 *****************************************************************************/
1823
1824
1825 // helper function used below
1826 template <int n>
select4(Vec8q const & a,Vec8q const & b)1827 static inline Vec4q select4(Vec8q const & a, Vec8q const & b) {
1828 switch (n) {
1829 case 0:
1830 return a.get_low();
1831 case 1:
1832 return a.get_high();
1833 case 2:
1834 return b.get_low();
1835 case 3:
1836 return b.get_high();
1837 }
1838 return Vec4q(0);
1839 }
1840
1841 // blend vectors Vec8q
1842 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8q(Vec8q const & a,Vec8q const & b)1843 static inline Vec8q blend8q(Vec8q const & a, Vec8q const & b) {
1844 const int j0 = i0 >= 0 ? i0/4 : i0;
1845 const int j1 = i1 >= 0 ? i1/4 : i1;
1846 const int j2 = i2 >= 0 ? i2/4 : i2;
1847 const int j3 = i3 >= 0 ? i3/4 : i3;
1848 const int j4 = i4 >= 0 ? i4/4 : i4;
1849 const int j5 = i5 >= 0 ? i5/4 : i5;
1850 const int j6 = i6 >= 0 ? i6/4 : i6;
1851 const int j7 = i7 >= 0 ? i7/4 : i7;
1852 Vec4q x0, x1;
1853
1854 const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3;
1855 const int r1 = j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
1856 const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : j3;
1857 const int s1 = (j5 >= 0 && j5 != r1) ? j5 : (j6 >= 0 && j6 != r1) ? j6 : j7;
1858
1859 // Combine all the indexes into a single bitfield, with 4 bits for each
1860 const int m1 = (i0&0xF) | (i1&0xF)<<4 | (i2&0xF)<<8 | (i3&0xF)<<12 | (i4&0xF)<<16 | (i5&0xF)<<20 | (i6&0xF)<<24 | (i7&0xF)<<28;
1861
1862 // Mask to zero out negative indexes
1863 const int mz = (i0<0?0:0xF) | (i1<0?0:0xF)<<4 | (i2<0?0:0xF)<<8 | (i3<0?0:0xF)<<12 | (i4<0?0:0xF)<<16 | (i5<0?0:0xF)<<20 | (i6<0?0:0xF)<<24 | (i7<0?0:0xF)<<28;
1864
1865 if (r0 < 0) {
1866 x0 = Vec4q(0);
1867 }
1868 else if (((m1 ^ r0*0x4444) & 0xCCCC & mz) == 0) {
1869 // i0 - i3 all from same source
1870 x0 = permute4q<i0 & -13, i1 & -13, i2 & -13, i3 & -13> (select4<r0> (a,b));
1871 }
1872 else if ((j2 < 0 || j2 == r0 || j2 == s0) && (j3 < 0 || j3 == r0 || j3 == s0)) {
1873 // i0 - i3 all from two sources
1874 const int k0 = i0 >= 0 ? i0 & 3 : i0;
1875 const int k1 = (i1 >= 0 ? i1 & 3 : i1) | (j1 == s0 ? 4 : 0);
1876 const int k2 = (i2 >= 0 ? i2 & 3 : i2) | (j2 == s0 ? 4 : 0);
1877 const int k3 = (i3 >= 0 ? i3 & 3 : i3) | (j3 == s0 ? 4 : 0);
1878 x0 = blend4q<k0,k1,k2,k3> (select4<r0>(a,b), select4<s0>(a,b));
1879 }
1880 else {
1881 // i0 - i3 from three or four different sources
1882 x0 = blend4q<0,1,6,7> (
1883 blend4q<i0 & -13, (i1 & -13) | 4, -0x100, -0x100> (select4<j0>(a,b), select4<j1>(a,b)),
1884 blend4q<-0x100, -0x100, i2 & -13, (i3 & -13) | 4> (select4<j2>(a,b), select4<j3>(a,b)));
1885 }
1886
1887 if (r1 < 0) {
1888 x1 = Vec4q(0);
1889 }
1890 else if (((m1 ^ uint32_t(r1)*0x44440000u) & 0xCCCC0000 & mz) == 0) {
1891 // i4 - i7 all from same source
1892 x1 = permute4q<i4 & -13, i5 & -13, i6 & -13, i7 & -13> (select4<r1> (a,b));
1893 }
1894 else if ((j6 < 0 || j6 == r1 || j6 == s1) && (j7 < 0 || j7 == r1 || j7 == s1)) {
1895 // i4 - i7 all from two sources
1896 const int k4 = i4 >= 0 ? i4 & 3 : i4;
1897 const int k5 = (i5 >= 0 ? i5 & 3 : i5) | (j5 == s1 ? 4 : 0);
1898 const int k6 = (i6 >= 0 ? i6 & 3 : i6) | (j6 == s1 ? 4 : 0);
1899 const int k7 = (i7 >= 0 ? i7 & 3 : i7) | (j7 == s1 ? 4 : 0);
1900 x1 = blend4q<k4,k5,k6,k7> (select4<r1>(a,b), select4<s1>(a,b));
1901 }
1902 else {
1903 // i4 - i7 from three or four different sources
1904 x1 = blend4q<0,1,6,7> (
1905 blend4q<i4 & -13, (i5 & -13) | 4, -0x100, -0x100> (select4<j4>(a,b), select4<j5>(a,b)),
1906 blend4q<-0x100, -0x100, i6 & -13, (i7 & -13) | 4> (select4<j6>(a,b), select4<j7>(a,b)));
1907 }
1908
1909 return Vec8q(x0,x1);
1910 }
1911
1912 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
blend8uq(Vec8uq const & a,Vec8uq const & b)1913 static inline Vec8uq blend8uq(Vec8uq const & a, Vec8uq const & b) {
1914 return Vec8uq( blend8q<i0,i1,i2,i3,i4,i5,i6,i7> (a,b));
1915 }
1916
1917
1918 // helper function used below
1919 template <int n>
select4(Vec16i const & a,Vec16i const & b)1920 static inline Vec8i select4(Vec16i const & a, Vec16i const & b) {
1921 switch (n) {
1922 case 0:
1923 return a.get_low();
1924 case 1:
1925 return a.get_high();
1926 case 2:
1927 return b.get_low();
1928 case 3:
1929 return b.get_high();
1930 }
1931 return Vec8i(0);
1932 }
1933
1934 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
1935 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
blend16i(Vec16i const & a,Vec16i const & b)1936 static inline Vec16i blend16i(Vec16i const & a, Vec16i const & b) {
1937
1938 const int j0 = i0 >= 0 ? i0 /8 : i0;
1939 const int j1 = i1 >= 0 ? i1 /8 : i1;
1940 const int j2 = i2 >= 0 ? i2 /8 : i2;
1941 const int j3 = i3 >= 0 ? i3 /8 : i3;
1942 const int j4 = i4 >= 0 ? i4 /8 : i4;
1943 const int j5 = i5 >= 0 ? i5 /8 : i5;
1944 const int j6 = i6 >= 0 ? i6 /8 : i6;
1945 const int j7 = i7 >= 0 ? i7 /8 : i7;
1946 const int j8 = i8 >= 0 ? i8 /8 : i8;
1947 const int j9 = i9 >= 0 ? i9 /8 : i9;
1948 const int j10 = i10 >= 0 ? i10/8 : i10;
1949 const int j11 = i11 >= 0 ? i11/8 : i11;
1950 const int j12 = i12 >= 0 ? i12/8 : i12;
1951 const int j13 = i13 >= 0 ? i13/8 : i13;
1952 const int j14 = i14 >= 0 ? i14/8 : i14;
1953 const int j15 = i15 >= 0 ? i15/8 : i15;
1954
1955 Vec8i x0, x1;
1956
1957 const int r0 = j0 >= 0 ? j0 : j1 >= 0 ? j1 : j2 >= 0 ? j2 : j3 >= 0 ? j3 : j4 >= 0 ? j4 : j5 >= 0 ? j5 : j6 >= 0 ? j6 : j7;
1958 const int r1 = j8 >= 0 ? j8 : j9 >= 0 ? j9 : j10 >= 0 ? j10 : j11 >= 0 ? j11 : j12 >= 0 ? j12 : j13 >= 0 ? j13 : j14 >= 0 ? j14 : j15;
1959 const int s0 = (j1 >= 0 && j1 != r0) ? j1 : (j2 >= 0 && j2 != r0) ? j2 : (j3 >= 0 && j3 != r0) ? j3 : (j4 >= 0 && j4 != r0) ? j4 : (j5 >= 0 && j5 != r0) ? j5 : (j6 >= 0 && j6 != r0) ? j6 : j7;
1960 const int s1 = (j9 >= 0 && j9 != r1) ? j9 : (j10>= 0 && j10!= r1) ? j10 : (j11>= 0 && j11!= r1) ? j11: (j12>= 0 && j12!= r1) ? j12: (j13>= 0 && j13!= r1) ? j13: (j14>= 0 && j14!= r1) ? j14: j15;
1961
1962 if (r0 < 0) {
1963 x0 = Vec8i(0);
1964 }
1965 else if (r0 == s0) {
1966 // i0 - i7 all from same source
1967 x0 = permute8i<i0&-25, i1&-25, i2&-25, i3&-25, i4&-25, i5&-25, i6&-25, i7&-25> (select4<r0> (a,b));
1968 }
1969 else if ((j2<0||j2==r0||j2==s0) && (j3<0||j3==r0||j3==s0) && (j4<0||j4==r0||j4==s0) && (j5<0||j5==r0||j5==s0) && (j6<0||j6==r0||j6==s0) && (j7<0||j7==r0||j7==s0)) {
1970 // i0 - i7 all from two sources
1971 const int k0 = i0 >= 0 ? (i0 & 7) : i0;
1972 const int k1 = (i1 >= 0 ? (i1 & 7) : i1) | (j1 == s0 ? 8 : 0);
1973 const int k2 = (i2 >= 0 ? (i2 & 7) : i2) | (j2 == s0 ? 8 : 0);
1974 const int k3 = (i3 >= 0 ? (i3 & 7) : i3) | (j3 == s0 ? 8 : 0);
1975 const int k4 = (i4 >= 0 ? (i4 & 7) : i4) | (j4 == s0 ? 8 : 0);
1976 const int k5 = (i5 >= 0 ? (i5 & 7) : i5) | (j5 == s0 ? 8 : 0);
1977 const int k6 = (i6 >= 0 ? (i6 & 7) : i6) | (j6 == s0 ? 8 : 0);
1978 const int k7 = (i7 >= 0 ? (i7 & 7) : i7) | (j7 == s0 ? 8 : 0);
1979 x0 = blend8i<k0,k1,k2,k3,k4,k5,k6,k7> (select4<r0>(a,b), select4<s0>(a,b));
1980 }
1981 else {
1982 // i0 - i7 from three or four different sources
1983 const int n0 = j0 >= 0 ? j0 /2*8 + 0 : j0;
1984 const int n1 = j1 >= 0 ? j1 /2*8 + 1 : j1;
1985 const int n2 = j2 >= 0 ? j2 /2*8 + 2 : j2;
1986 const int n3 = j3 >= 0 ? j3 /2*8 + 3 : j3;
1987 const int n4 = j4 >= 0 ? j4 /2*8 + 4 : j4;
1988 const int n5 = j5 >= 0 ? j5 /2*8 + 5 : j5;
1989 const int n6 = j6 >= 0 ? j6 /2*8 + 6 : j6;
1990 const int n7 = j7 >= 0 ? j7 /2*8 + 7 : j7;
1991 x0 = blend8i<n0, n1, n2, n3, n4, n5, n6, n7> (
1992 blend8i< j0 & 2 ? -256 : i0 &15, j1 & 2 ? -256 : i1 &15, j2 & 2 ? -256 : i2 &15, j3 & 2 ? -256 : i3 &15, j4 & 2 ? -256 : i4 &15, j5 & 2 ? -256 : i5 &15, j6 & 2 ? -256 : i6 &15, j7 & 2 ? -256 : i7 &15> (a.get_low(),a.get_high()),
1993 blend8i<(j0^2)& 6 ? -256 : i0 &15, (j1^2)& 6 ? -256 : i1 &15, (j2^2)& 6 ? -256 : i2 &15, (j3^2)& 6 ? -256 : i3 &15, (j4^2)& 6 ? -256 : i4 &15, (j5^2)& 6 ? -256 : i5 &15, (j6^2)& 6 ? -256 : i6 &15, (j7^2)& 6 ? -256 : i7 &15> (b.get_low(),b.get_high()));
1994 }
1995
1996 if (r1 < 0) {
1997 x1 = Vec8i(0);
1998 }
1999 else if (r1 == s1) {
2000 // i8 - i15 all from same source
2001 x1 = permute8i<i8&-25, i9&-25, i10&-25, i11&-25, i12&-25, i13&-25, i14&-25, i15&-25> (select4<r1> (a,b));
2002 }
2003 else if ((j10<0||j10==r1||j10==s1) && (j11<0||j11==r1||j11==s1) && (j12<0||j12==r1||j12==s1) && (j13<0||j13==r1||j13==s1) && (j14<0||j14==r1||j14==s1) && (j15<0||j15==r1||j15==s1)) {
2004 // i8 - i15 all from two sources
2005 const int k8 = i8 >= 0 ? (i8 & 7) : i8;
2006 const int k9 = (i9 >= 0 ? (i9 & 7) : i9 ) | (j9 == s1 ? 8 : 0);
2007 const int k10= (i10>= 0 ? (i10& 7) : i10) | (j10== s1 ? 8 : 0);
2008 const int k11= (i11>= 0 ? (i11& 7) : i11) | (j11== s1 ? 8 : 0);
2009 const int k12= (i12>= 0 ? (i12& 7) : i12) | (j12== s1 ? 8 : 0);
2010 const int k13= (i13>= 0 ? (i13& 7) : i13) | (j13== s1 ? 8 : 0);
2011 const int k14= (i14>= 0 ? (i14& 7) : i14) | (j14== s1 ? 8 : 0);
2012 const int k15= (i15>= 0 ? (i15& 7) : i15) | (j15== s1 ? 8 : 0);
2013 x1 = blend8i<k8,k9,k10,k11,k12,k13,k14,k15> (select4<r1>(a,b), select4<s1>(a,b));
2014 }
2015 else {
2016 // i8 - i15 from three or four different sources
2017 const int n8 = j8 >= 0 ? j8 /2*8 + 0 : j8 ;
2018 const int n9 = j9 >= 0 ? j9 /2*8 + 1 : j9 ;
2019 const int n10= j10>= 0 ? j10/2*8 + 2 : j10;
2020 const int n11= j11>= 0 ? j11/2*8 + 3 : j11;
2021 const int n12= j12>= 0 ? j12/2*8 + 4 : j12;
2022 const int n13= j13>= 0 ? j13/2*8 + 5 : j13;
2023 const int n14= j14>= 0 ? j14/2*8 + 6 : j14;
2024 const int n15= j15>= 0 ? j15/2*8 + 7 : j15;
2025 x1 = blend8i<n8, n9, n10, n11, n12, n13, n14, n15> (
2026 blend8i< j8 & 2 ? -256 : i8 &15, j9 & 2 ? -256 : i9 &15, j10 & 2 ? -256 : i10 &15, j11 & 2 ? -256 : i11 &15, j12 & 2 ? -256 : i12 &15, j13 & 2 ? -256 : i13 &15, j14 & 2 ? -256 : i14 &15, j15 & 2 ? -256 : i15 &15> (a.get_low(),a.get_high()),
2027 blend8i<(j8^2)& 6 ? -256 : i8 &15, (j9^2)& 6 ? -256 : i9 &15, (j10^2)& 6 ? -256 : i10 &15, (j11^2)& 6 ? -256 : i11 &15, (j12^2)& 6 ? -256 : i12 &15, (j13^2)& 6 ? -256 : i13 &15, (j14^2)& 6 ? -256 : i14 &15, (j15^2)& 6 ? -256 : i15 &15> (b.get_low(),b.get_high()));
2028 }
2029 return Vec16i(x0,x1);
2030 }
2031
2032 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
2033 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15 >
blend16ui(Vec16ui const & a,Vec16ui const & b)2034 static inline Vec16ui blend16ui(Vec16ui const & a, Vec16ui const & b) {
2035 return Vec16ui( blend16i<i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15> (Vec16i(a),Vec16i(b)));
2036 }
2037
2038
2039 /*****************************************************************************
2040 *
2041 * Vector lookup functions
2042 *
2043 ******************************************************************************
2044 *
2045 * These functions use vector elements as indexes into a table.
2046 * The table is given as one or more vectors or as an array.
2047 *
2048 * This can be used for several purposes:
2049 * - table lookup
2050 * - permute or blend with variable indexes
2051 * - blend from more than two sources
2052 * - gather non-contiguous data
2053 *
2054 * An index out of range may produce any value - the actual value produced is
2055 * implementation dependent and may be different for different instruction
2056 * sets. An index out of range does not produce an error message or exception.
2057 *
2058 * Example:
2059 * Vec8q a(2,0,0,6,4,3,5,0); // index a is ( 2, 0, 0, 6, 4, 3, 5, 0)
2060 * Vec8q b(100,101,102,103,104,105,106,107); // table b is (100, 101, 102, 103, 104, 105, 106, 107)
2061 * Vec8q c;
2062 * c = lookup8 (a,b); // c is (102, 100, 100, 106, 104, 103, 105, 100)
2063 *
2064 *****************************************************************************/
2065
lookup16(Vec16i const & index,Vec16i const & table)2066 static inline Vec16i lookup16(Vec16i const & index, Vec16i const & table) {
2067 int32_t tab[16];
2068 table.store(tab);
2069 Vec8i t0 = lookup<16>(index.get_low(), tab);
2070 Vec8i t1 = lookup<16>(index.get_high(), tab);
2071 return Vec16i(t0, t1);
2072 }
2073
2074 template <int n>
lookup(Vec16i const & index,void const * table)2075 static inline Vec16i lookup(Vec16i const & index, void const * table) {
2076 if (n <= 0) return 0;
2077 if (n <= 8) {
2078 Vec8i table1 = Vec8i().load(table);
2079 return Vec16i(
2080 lookup8 (index.get_low(), table1),
2081 lookup8 (index.get_high(), table1));
2082 }
2083 if (n <= 16) return lookup16(index, Vec16i().load(table));
2084 // n > 16. Limit index
2085 Vec16ui i1;
2086 if ((n & (n-1)) == 0) {
2087 // n is a power of 2, make index modulo n
2088 i1 = Vec16ui(index) & (n-1);
2089 }
2090 else {
2091 // n is not a power of 2, limit to n-1
2092 i1 = min(Vec16ui(index), n-1);
2093 }
2094 int32_t const * t = (int32_t const *)table;
2095 return Vec16i(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]],
2096 t[i1[8]],t[i1[9]],t[i1[10]],t[i1[11]],t[i1[12]],t[i1[13]],t[i1[14]],t[i1[15]]);
2097 }
2098
lookup8(Vec8q const & index,Vec8q const & table)2099 static inline Vec8q lookup8(Vec8q const & index, Vec8q const & table) {
2100 int64_t tab[8];
2101 table.store(tab);
2102 Vec4q t0 = lookup<8>(index.get_low(), tab);
2103 Vec4q t1 = lookup<8>(index.get_high(), tab);
2104 return Vec8q(t0, t1);
2105 }
2106
2107 template <int n>
lookup(Vec8q const & index,void const * table)2108 static inline Vec8q lookup(Vec8q const & index, void const * table) {
2109 if (n <= 0) return 0;
2110 if (n <= 4) {
2111 Vec4q table1 = Vec4q().load(table);
2112 return Vec8q(
2113 lookup4 (index.get_low(), table1),
2114 lookup4 (index.get_high(), table1));
2115 }
2116 if (n <= 8) {
2117 return lookup8(index, Vec8q().load(table));
2118 }
2119 // n > 8. Limit index
2120 Vec8uq i1;
2121 if ((n & (n-1)) == 0) {
2122 // n is a power of 2, make index modulo n
2123 i1 = Vec8uq(index) & (n-1);
2124 }
2125 else {
2126 // n is not a power of 2, limit to n-1
2127 i1 = min(Vec8uq(index), n-1);
2128 }
2129 int64_t const * t = (int64_t const *)table;
2130 return Vec8q(t[i1[0]],t[i1[1]],t[i1[2]],t[i1[3]],t[i1[4]],t[i1[5]],t[i1[6]],t[i1[7]]);
2131 }
2132
2133 /*****************************************************************************
2134 *
2135 * Vector scatter functions
2136 *
2137 ******************************************************************************
2138 *
2139 * These functions write the elements of a vector to arbitrary positions in an
2140 * array in memory. Each vector element is written to an array position
2141 * determined by an index. An element is not written if the corresponding
2142 * index is out of range.
2143 * The indexes can be specified as constant template parameters or as an
2144 * integer vector.
2145 *
* The scatter functions are useful if the data are distributed in a sparse
2147 * manner into the array. If the array is dense then it is more efficient
2148 * to permute the data into the right positions and then write the whole
2149 * permuted vector into the array.
2150 *
2151 * Example:
2152 * Vec8q a(10,11,12,13,14,15,16,17);
2153 * int64_t b[16] = {0};
2154 * scatter<0,2,14,10,1,-1,5,9>(a,b);
2155 * // Now, b = {10,14,11,0,0,16,0,0,0,17,13,0,0,0,12,0}
2156 *
2157 *****************************************************************************/
2158
2159 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
2160 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
scatter(Vec16i const & data,void * array)2161 static inline void scatter(Vec16i const & data, void * array) {
2162 int32_t* arr = (int32_t*)array;
2163 const int index[16] = {i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15};
2164 for (int i = 0; i < 16; i++) {
2165 if (index[i] >= 0) arr[index[i]] = data[i];
2166 }
2167 }
2168
2169 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
scatter(Vec8q const & data,void * array)2170 static inline void scatter(Vec8q const & data, void * array) {
2171 int64_t* arr = (int64_t*)array;
2172 const int index[8] = {i0,i1,i2,i3,i4,i5,i6,i7};
2173 for (int i = 0; i < 8; i++) {
2174 if (index[i] >= 0) arr[index[i]] = data[i];
2175 }
2176 }
2177
scatter(Vec16i const & index,uint32_t limit,Vec16i const & data,void * array)2178 static inline void scatter(Vec16i const & index, uint32_t limit, Vec16i const & data, void * array) {
2179 int32_t* arr = (int32_t*)array;
2180 for (int i = 0; i < 16; i++) {
2181 if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
2182 }
2183 }
2184
scatter(Vec8q const & index,uint32_t limit,Vec8q const & data,void * array)2185 static inline void scatter(Vec8q const & index, uint32_t limit, Vec8q const & data, void * array) {
2186 int64_t* arr = (int64_t*)array;
2187 for (int i = 0; i < 8; i++) {
2188 if (uint64_t(index[i]) < uint64_t(limit)) arr[index[i]] = data[i];
2189 }
2190 }
2191
scatter(Vec8i const & index,uint32_t limit,Vec8q const & data,void * array)2192 static inline void scatter(Vec8i const & index, uint32_t limit, Vec8q const & data, void * array) {
2193 int64_t* arr = (int64_t*)array;
2194 for (int i = 0; i < 8; i++) {
2195 if (uint32_t(index[i]) < limit) arr[index[i]] = data[i];
2196 }
2197 }
2198
2199 /*****************************************************************************
2200 *
2201 * Gather functions with fixed indexes
2202 *
2203 *****************************************************************************/
2204 // Load elements from array a with indices i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15
2205 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
2206 int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15>
gather16i(void const * a)2207 static inline Vec16i gather16i(void const * a) {
2208 Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7|i8|i9|i10|i11|i12|i13|i14|i15)>=0> Negative_array_index; // Error message if index is negative
2209 // find smallest and biggest index, using only compile-time constant expressions
2210 const int i01min = i0 < i1 ? i0 : i1;
2211 const int i23min = i2 < i3 ? i2 : i3;
2212 const int i45min = i4 < i5 ? i4 : i5;
2213 const int i67min = i6 < i7 ? i6 : i7;
2214 const int i89min = i8 < i9 ? i8 : i9;
2215 const int i1011min = i10 < i11 ? i10 : i11;
2216 const int i1213min = i12 < i13 ? i12 : i13;
2217 const int i1415min = i14 < i15 ? i14 : i15;
2218 const int i0_3min = i01min < i23min ? i01min : i23min;
2219 const int i4_7min = i45min < i67min ? i45min : i67min;
2220 const int i8_11min = i89min < i1011min ? i89min : i1011min;
2221 const int i12_15min = i1213min < i1415min ? i1213min : i1415min;
2222 const int i0_7min = i0_3min < i4_7min ? i0_3min : i4_7min;
2223 const int i8_15min = i8_11min < i12_15min ? i8_11min : i12_15min;
2224 const int imin = i0_7min < i8_15min ? i0_7min : i8_15min;
2225 const int i01max = i0 > i1 ? i0 : i1;
2226 const int i23max = i2 > i3 ? i2 : i3;
2227 const int i45max = i4 > i5 ? i4 : i5;
2228 const int i67max = i6 > i7 ? i6 : i7;
2229 const int i89max = i8 > i9 ? i8 : i9;
2230 const int i1011max = i10 > i11 ? i10 : i11;
2231 const int i1213max = i12 > i13 ? i12 : i13;
2232 const int i1415max = i14 > i15 ? i14 : i15;
2233 const int i0_3max = i01max > i23max ? i01max : i23max;
2234 const int i4_7max = i45max > i67max ? i45max : i67max;
2235 const int i8_11max = i89max > i1011max ? i89max : i1011max;
2236 const int i12_15max = i1213max > i1415max ? i1213max : i1415max;
2237 const int i0_7max = i0_3max > i4_7max ? i0_3max : i4_7max;
2238 const int i8_15max = i8_11max > i12_15max ? i8_11max : i12_15max;
2239 const int imax = i0_7max > i8_15max ? i0_7max : i8_15max;
2240 if (imax - imin <= 15) {
2241 // load one contiguous block and permute
2242 if (imax > 15) {
2243 // make sure we don't read past the end of the array
2244 Vec16i b = Vec16i().load((int32_t const *)a + imax-15);
2245 return permute16i<i0-imax+15, i1-imax+15, i2-imax+15, i3-imax+15, i4-imax+15, i5-imax+15, i6-imax+15, i7-imax+15,
2246 i8-imax+15, i9-imax+15, i10-imax+15, i11-imax+15, i12-imax+15, i13-imax+15, i14-imax+15, i15-imax+15> (b);
2247 }
2248 else {
2249 Vec16i b = Vec16i().load((int32_t const *)a + imin);
2250 return permute16i<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin,
2251 i8-imin, i9-imin, i10-imin, i11-imin, i12-imin, i13-imin, i14-imin, i15-imin> (b);
2252 }
2253 }
2254 if ((i0<imin+16 || i0>imax-16) && (i1<imin+16 || i1>imax-16) && (i2<imin+16 || i2>imax-16) && (i3<imin+16 || i3>imax-16)
2255 && (i4<imin+16 || i4>imax-16) && (i5<imin+16 || i5>imax-16) && (i6<imin+16 || i6>imax-16) && (i7<imin+16 || i7>imax-16)
2256 && (i8<imin+16 || i8>imax-16) && (i9<imin+16 || i9>imax-16) && (i10<imin+16 || i10>imax-16) && (i11<imin+16 || i11>imax-16)
2257 && (i12<imin+16 || i12>imax-16) && (i13<imin+16 || i13>imax-16) && (i14<imin+16 || i14>imax-16) && (i15<imin+16 || i15>imax-16) ) {
2258 // load two contiguous blocks and blend
2259 Vec16i b = Vec16i().load((int32_t const *)a + imin);
2260 Vec16i c = Vec16i().load((int32_t const *)a + imax-15);
2261 const int j0 = i0 <imin+16 ? i0 -imin : 31-imax+i0;
2262 const int j1 = i1 <imin+16 ? i1 -imin : 31-imax+i1;
2263 const int j2 = i2 <imin+16 ? i2 -imin : 31-imax+i2;
2264 const int j3 = i3 <imin+16 ? i3 -imin : 31-imax+i3;
2265 const int j4 = i4 <imin+16 ? i4 -imin : 31-imax+i4;
2266 const int j5 = i5 <imin+16 ? i5 -imin : 31-imax+i5;
2267 const int j6 = i6 <imin+16 ? i6 -imin : 31-imax+i6;
2268 const int j7 = i7 <imin+16 ? i7 -imin : 31-imax+i7;
2269 const int j8 = i8 <imin+16 ? i8 -imin : 31-imax+i8;
2270 const int j9 = i9 <imin+16 ? i9 -imin : 31-imax+i9;
2271 const int j10 = i10<imin+16 ? i10-imin : 31-imax+i10;
2272 const int j11 = i11<imin+16 ? i11-imin : 31-imax+i11;
2273 const int j12 = i12<imin+16 ? i12-imin : 31-imax+i12;
2274 const int j13 = i13<imin+16 ? i13-imin : 31-imax+i13;
2275 const int j14 = i14<imin+16 ? i14-imin : 31-imax+i14;
2276 const int j15 = i15<imin+16 ? i15-imin : 31-imax+i15;
2277 return blend16i<j0,j1,j2,j3,j4,j5,j6,j7,j8,j9,j10,j11,j12,j13,j14,j15>(b, c);
2278 }
2279 // use lookup function
2280 return lookup<imax+1>(Vec16i(i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15), a);
2281 }
2282
2283
2284 template <int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7>
gather8q(void const * a)2285 static inline Vec8q gather8q(void const * a) {
2286 Static_error_check<(i0|i1|i2|i3|i4|i5|i6|i7)>=0> Negative_array_index; // Error message if index is negative
2287
2288 const int i01min = i0 < i1 ? i0 : i1;
2289 const int i23min = i2 < i3 ? i2 : i3;
2290 const int i45min = i4 < i5 ? i4 : i5;
2291 const int i67min = i6 < i7 ? i6 : i7;
2292 const int i0123min = i01min < i23min ? i01min : i23min;
2293 const int i4567min = i45min < i67min ? i45min : i67min;
2294 const int imin = i0123min < i4567min ? i0123min : i4567min;
2295 const int i01max = i0 > i1 ? i0 : i1;
2296 const int i23max = i2 > i3 ? i2 : i3;
2297 const int i45max = i4 > i5 ? i4 : i5;
2298 const int i67max = i6 > i7 ? i6 : i7;
2299 const int i0123max = i01max > i23max ? i01max : i23max;
2300 const int i4567max = i45max > i67max ? i45max : i67max;
2301 const int imax = i0123max > i4567max ? i0123max : i4567max;
2302 if (imax - imin <= 7) {
2303 // load one contiguous block and permute
2304 if (imax > 7) {
2305 // make sure we don't read past the end of the array
2306 Vec8q b = Vec8q().load((int64_t const *)a + imax-7);
2307 return permute8q<i0-imax+7, i1-imax+7, i2-imax+7, i3-imax+7, i4-imax+7, i5-imax+7, i6-imax+7, i7-imax+7> (b);
2308 }
2309 else {
2310 Vec8q b = Vec8q().load((int64_t const *)a + imin);
2311 return permute8q<i0-imin, i1-imin, i2-imin, i3-imin, i4-imin, i5-imin, i6-imin, i7-imin> (b);
2312 }
2313 }
2314 if ((i0<imin+8 || i0>imax-8) && (i1<imin+8 || i1>imax-8) && (i2<imin+8 || i2>imax-8) && (i3<imin+8 || i3>imax-8)
2315 && (i4<imin+8 || i4>imax-8) && (i5<imin+8 || i5>imax-8) && (i6<imin+8 || i6>imax-8) && (i7<imin+8 || i7>imax-8)) {
2316 // load two contiguous blocks and blend
2317 Vec8q b = Vec8q().load((int64_t const *)a + imin);
2318 Vec8q c = Vec8q().load((int64_t const *)a + imax-7);
2319 const int j0 = i0<imin+8 ? i0-imin : 15-imax+i0;
2320 const int j1 = i1<imin+8 ? i1-imin : 15-imax+i1;
2321 const int j2 = i2<imin+8 ? i2-imin : 15-imax+i2;
2322 const int j3 = i3<imin+8 ? i3-imin : 15-imax+i3;
2323 const int j4 = i4<imin+8 ? i4-imin : 15-imax+i4;
2324 const int j5 = i5<imin+8 ? i5-imin : 15-imax+i5;
2325 const int j6 = i6<imin+8 ? i6-imin : 15-imax+i6;
2326 const int j7 = i7<imin+8 ? i7-imin : 15-imax+i7;
2327 return blend8q<j0, j1, j2, j3, j4, j5, j6, j7>(b, c);
2328 }
2329 // use lookup function
2330 return lookup<imax+1>(Vec8q(i0,i1,i2,i3,i4,i5,i6,i7), a);
2331 }
2332
2333
2334 /*****************************************************************************
2335 *
2336 * Functions for conversion between integer sizes
2337 *
2338 *****************************************************************************/
2339
2340 // Extend 16-bit integers to 32-bit integers, signed and unsigned
2341
2342 // Function extend_to_int : extends Vec16s to Vec16i with sign extension
extend_to_int(Vec16s const & a)2343 static inline Vec16i extend_to_int (Vec16s const & a) {
2344 return Vec16i(extend_low(a), extend_high(a));
2345 }
2346
2347 // Function extend_to_int : extends Vec16us to Vec16ui with zero extension
extend_to_int(Vec16us const & a)2348 static inline Vec16ui extend_to_int (Vec16us const & a) {
2349 return Vec16i(extend_low(a), extend_high(a));
2350 }
2351
2352 // Function extend_to_int : extends Vec16c to Vec16i with sign extension
extend_to_int(Vec16c const & a)2353 static inline Vec16i extend_to_int (Vec16c const & a) {
2354 return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
2355 }
2356
2357 // Function extend_to_int : extends Vec16uc to Vec16ui with zero extension
extend_to_int(Vec16uc const & a)2358 static inline Vec16ui extend_to_int (Vec16uc const & a) {
2359 return extend_to_int(Vec16s(extend_low(a), extend_high(a)));
2360 }
2361
2362
2363 // Extend 32-bit integers to 64-bit integers, signed and unsigned
2364
2365 // Function extend_low : extends the low 8 elements to 64 bits with sign extension
extend_low(Vec16i const & a)2366 static inline Vec8q extend_low (Vec16i const & a) {
2367 return Vec8q(extend_low(a.get_low()), extend_high(a.get_low()));
2368 }
2369
2370 // Function extend_high : extends the high 8 elements to 64 bits with sign extension
extend_high(Vec16i const & a)2371 static inline Vec8q extend_high (Vec16i const & a) {
2372 return Vec8q(extend_low(a.get_high()), extend_high(a.get_high()));
2373 }
2374
2375 // Function extend_low : extends the low 8 elements to 64 bits with zero extension
extend_low(Vec16ui const & a)2376 static inline Vec8uq extend_low (Vec16ui const & a) {
2377 return Vec8q(extend_low(a.get_low()), extend_high(a.get_low()));
2378 }
2379
2380 // Function extend_high : extends the high 8 elements to 64 bits with zero extension
extend_high(Vec16ui const & a)2381 static inline Vec8uq extend_high (Vec16ui const & a) {
2382 return Vec8q(extend_low(a.get_high()), extend_high(a.get_high()));
2383 }
2384
2385
2386 // Compress 32-bit integers to 8-bit integers, signed and unsigned, with and without saturation
2387
2388 // Function compress : packs two vectors of 16-bit integers into one vector of 8-bit integers
2389 // Overflow wraps around
compress_to_int8(Vec16i const & a)2390 static inline Vec16c compress_to_int8 (Vec16i const & a) {
2391 Vec16s b = compress(a.get_low(), a.get_high());
2392 Vec16c c = compress(b.get_low(), b.get_high());
2393 return c;
2394 }
2395
compress_to_int16(Vec16i const & a)2396 static inline Vec16s compress_to_int16 (Vec16i const & a) {
2397 return compress(a.get_low(), a.get_high());
2398 }
2399
2400 // with signed saturation
compress_to_int8_saturated(Vec16i const & a)2401 static inline Vec16c compress_to_int8_saturated (Vec16i const & a) {
2402 Vec16s b = compress_saturated(a.get_low(), a.get_high());
2403 Vec16c c = compress_saturated(b.get_low(), b.get_high());
2404 return c;
2405 }
2406
compress_to_int16_saturated(Vec16i const & a)2407 static inline Vec16s compress_to_int16_saturated (Vec16i const & a) {
2408 return compress_saturated(a.get_low(), a.get_high());
2409 }
2410
2411 // with unsigned saturation
compress_to_int8_saturated(Vec16ui const & a)2412 static inline Vec16uc compress_to_int8_saturated (Vec16ui const & a) {
2413 Vec16us b = compress_saturated(a.get_low(), a.get_high());
2414 Vec16uc c = compress_saturated(b.get_low(), b.get_high());
2415 return c;
2416 }
2417
compress_to_int16_saturated(Vec16ui const & a)2418 static inline Vec16us compress_to_int16_saturated (Vec16ui const & a) {
2419 return compress_saturated(a.get_low(), a.get_high());
2420 }
2421
2422 // Compress 64-bit integers to 32-bit integers, signed and unsigned, with and without saturation
2423
2424 // Function compress : packs two vectors of 64-bit integers into one vector of 32-bit integers
2425 // Overflow wraps around
compress(Vec8q const & low,Vec8q const & high)2426 static inline Vec16i compress (Vec8q const & low, Vec8q const & high) {
2427 return Vec16i(compress(low.get_low(),low.get_high()), compress(high.get_low(),high.get_high()));
2428 }
2429
2430 // Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
2431 // Signed, with saturation
compress_saturated(Vec8q const & low,Vec8q const & high)2432 static inline Vec16i compress_saturated (Vec8q const & low, Vec8q const & high) {
2433 return Vec16i(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
2434 }
2435
2436 // Function compress_saturated : packs two vectors of 64-bit integers into one vector of 32-bit integers
2437 // Unsigned, with saturation
compress_saturated(Vec8uq const & low,Vec8uq const & high)2438 static inline Vec16ui compress_saturated (Vec8uq const & low, Vec8uq const & high) {
2439 return Vec16ui(compress_saturated(low.get_low(),low.get_high()), compress_saturated(high.get_low(),high.get_high()));
2440 }
2441
2442
2443 /*****************************************************************************
2444 *
2445 * Integer division operators
2446 *
2447 * Please see the file vectori128.h for explanation.
2448 *
2449 *****************************************************************************/
2450
2451 // vector operator / : divide each element by divisor
2452
2453 // vector operator / : divide all elements by same integer
2454 static inline Vec16i operator / (Vec16i const & a, Divisor_i const & d) {
2455 return Vec16i(a.get_low() / d, a.get_high() / d);
2456 }
2457
2458 // vector operator /= : divide
2459 static inline Vec16i & operator /= (Vec16i & a, Divisor_i const & d) {
2460 a = a / d;
2461 return a;
2462 }
2463
2464 // vector operator / : divide all elements by same integer
2465 static inline Vec16ui operator / (Vec16ui const & a, Divisor_ui const & d) {
2466 return Vec16ui(a.get_low() / d, a.get_high() / d);
2467 }
2468
2469 // vector operator /= : divide
2470 static inline Vec16ui & operator /= (Vec16ui & a, Divisor_ui const & d) {
2471 a = a / d;
2472 return a;
2473 }
2474
2475
2476 /*****************************************************************************
2477 *
2478 * Integer division 2: divisor is a compile-time constant
2479 *
2480 *****************************************************************************/
2481
2482 // Divide Vec16i by compile-time constant
2483 template <int32_t d>
divide_by_i(Vec16i const & a)2484 static inline Vec16i divide_by_i(Vec16i const & a) {
2485 return Vec16i(divide_by_i<d>(a.get_low()), divide_by_i<d>(a.get_high()));
2486 }
2487
2488 // define Vec16i a / const_int(d)
2489 template <int32_t d>
2490 static inline Vec16i operator / (Vec16i const & a, Const_int_t<d>) {
2491 return divide_by_i<d>(a);
2492 }
2493
2494 // define Vec16i a / const_uint(d)
2495 template <uint32_t d>
2496 static inline Vec16i operator / (Vec16i const & a, Const_uint_t<d>) {
2497 Static_error_check< (d<0x80000000u) > Error_overflow_dividing_signed_by_unsigned; // Error: dividing signed by overflowing unsigned
2498 return divide_by_i<int32_t(d)>(a); // signed divide
2499 }
2500
2501 // vector operator /= : divide
2502 template <int32_t d>
2503 static inline Vec16i & operator /= (Vec16i & a, Const_int_t<d> b) {
2504 a = a / b;
2505 return a;
2506 }
2507
2508 // vector operator /= : divide
2509 template <uint32_t d>
2510 static inline Vec16i & operator /= (Vec16i & a, Const_uint_t<d> b) {
2511 a = a / b;
2512 return a;
2513 }
2514
2515 // Divide Vec16ui by compile-time constant
2516 template <uint32_t d>
divide_by_ui(Vec16ui const & a)2517 static inline Vec16ui divide_by_ui(Vec16ui const & a) {
2518 return Vec16ui( divide_by_ui<d>(a.get_low()), divide_by_ui<d>(a.get_high()));
2519 }
2520
2521 // define Vec16ui a / const_uint(d)
2522 template <uint32_t d>
2523 static inline Vec16ui operator / (Vec16ui const & a, Const_uint_t<d>) {
2524 return divide_by_ui<d>(a);
2525 }
2526
2527 // define Vec16ui a / const_int(d)
2528 template <int32_t d>
2529 static inline Vec16ui operator / (Vec16ui const & a, Const_int_t<d>) {
2530 Static_error_check< (d>=0) > Error_dividing_unsigned_by_negative;// Error: dividing unsigned by negative is ambiguous
2531 return divide_by_ui<d>(a); // unsigned divide
2532 }
2533
2534 // vector operator /= : divide
2535 template <uint32_t d>
2536 static inline Vec16ui & operator /= (Vec16ui & a, Const_uint_t<d> b) {
2537 a = a / b;
2538 return a;
2539 }
2540
2541 // vector operator /= : divide
2542 template <int32_t d>
2543 static inline Vec16ui & operator /= (Vec16ui & a, Const_int_t<d> b) {
2544 a = a / b;
2545 return a;
2546 }
2547
2548
2549 /*****************************************************************************
2550 *
2551 * Horizontal scan functions
2552 *
2553 *****************************************************************************/
2554
2555 // Get index to the first element that is true. Return -1 if all are false
horizontal_find_first(Vec16ib const & x)2556 static inline int horizontal_find_first(Vec16ib const & x) {
2557 int a1 = horizontal_find_first(x.get_low());
2558 if (a1 >= 0) return a1;
2559 int a2 = horizontal_find_first(x.get_high());
2560 if (a2 < 0) return a2;
2561 return a2 + 8;
2562 }
2563
horizontal_find_first(Vec8qb const & x)2564 static inline int horizontal_find_first(Vec8qb const & x) {
2565 int a1 = horizontal_find_first(x.get_low());
2566 if (a1 >= 0) return a1;
2567 int a2 = horizontal_find_first(x.get_high());
2568 if (a2 < 0) return a2;
2569 return a2 + 4;
2570 }
2571
2572 // count the number of true elements
horizontal_count(Vec16ib const & x)2573 static inline uint32_t horizontal_count(Vec16ib const & x) {
2574 return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
2575 }
2576
horizontal_count(Vec8qb const & x)2577 static inline uint32_t horizontal_count(Vec8qb const & x) {
2578 return horizontal_count(x.get_low()) + horizontal_count(x.get_high());
2579 }
2580
2581
2582 /*****************************************************************************
2583 *
2584 * Boolean <-> bitfield conversion functions
2585 *
2586 *****************************************************************************/
2587
2588 // to_bits: convert to integer bitfield
to_bits(Vec16b const & a)2589 static inline uint16_t to_bits(Vec16b const & a) {
2590 return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8);
2591 }
2592
2593 // to_bits: convert to integer bitfield
to_bits(Vec16ib const & a)2594 static inline uint16_t to_bits(Vec16ib const & a) {
2595 return to_bits(a.get_low()) | ((uint16_t)to_bits(a.get_high()) << 8);
2596 }
2597
2598 // to_Vec16ib: convert integer bitfield to boolean vector
to_Vec16ib(uint16_t const & x)2599 static inline Vec16ib to_Vec16ib(uint16_t const & x) {
2600 return Vec16i(to_Vec8ib(uint8_t(x)), to_Vec8ib(uint8_t(x>>8)));
2601 }
2602
2603 // to_bits: convert to integer bitfield
to_bits(Vec8b const & a)2604 static inline uint8_t to_bits(Vec8b const & a) {
2605 return to_bits(a.get_low()) | (to_bits(a.get_high()) << 4);
2606 }
2607
2608 // to_Vec8qb: convert integer bitfield to boolean vector
to_Vec8qb(uint8_t x)2609 static inline Vec8qb to_Vec8qb(uint8_t x) {
2610 return Vec8q(to_Vec4qb(x), to_Vec4qb(x>>4));
2611 }
2612
2613 #ifdef VCL_NAMESPACE
2614 }
2615 #endif
2616
2617 #endif // VECTORI512_H
2618