/*
 * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief SIMD types and primitive operations.
 */

#ifndef SIMD_UTILS
#define SIMD_UTILS

#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif

#include "config.h"
#include "ue2common.h"
#include "simd_types.h"
#include "unaligned.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include <string.h> // for memcpy

// Define a common assume_aligned using an appropriate compiler built-in, if
// it's available. Note that we need to handle C or C++ compilation.
#ifdef __cplusplus
#  ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
#  endif
#else
#  ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
#    define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
#  endif
#endif

// Fallback to identity case.
#ifndef assume_aligned
#define assume_aligned(x, y) (x)
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern const char vbs_mask_data[];
#ifdef __cplusplus
}
#endif

static really_inline m128 ones128(void) {
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
    /* gcc gets this right */
    return _mm_set1_epi8(0xFF);
#else
    /* trick from Intel's optimization guide to generate all-ones.
     * ICC converts this to the single cmpeq instruction */
    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
#endif
}

static really_inline m128 zeroes128(void) {
    return _mm_setzero_si128();
}

/** \brief Bitwise not for m128. */
static really_inline m128 not128(m128 a) {
    return _mm_xor_si128(a, ones128());
}

/** \brief Return 1 if a and b are different, otherwise 0. */
static really_inline int diff128(m128 a, m128 b) {
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
}

static really_inline int isnonzero128(m128 a) {
    return !!diff128(a, zeroes128());
}

/**
 * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich128(m128 a, m128 b) {
    a = _mm_cmpeq_epi32(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
}
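/*
 * Example (informal): with a = _mm_set_epi32(3, 2, 1, 0) and
 * b = _mm_set_epi32(7, 2, 1, 0), only the topmost 32-bit word differs, so
 * diffrich128(a, b) == 0x8 (bit i of the result corresponds to word i).
 */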

/**
 * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
 * returns a 4-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_128(m128 a, m128 b) {
#if defined(HAVE_SSE41)
    a = _mm_cmpeq_epi64(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
#else
    u32 d = diffrich128(a, b);
    return (d | (d >> 1)) & 0x5;
#endif
}
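/*
 * Note: in the fallback above, OR-folding each pair of adjacent 32-bit
 * difference bits into the even bit position means bit 2*i is set iff 64-bit
 * word i differs, matching the SSE4.1 path.
 */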

static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm_sll_epi64(a, x);
}
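/*
 * Note: when b is a compile-time constant, the __builtin_constant_p branch
 * above lets the compiler emit the immediate-count form (_mm_slli_epi64);
 * otherwise the count is moved into a vector register for _mm_sll_epi64.
 */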

#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b)      _mm_cmpeq_epi8((a), (b))
#define movemask128(a)  ((u32)_mm_movemask_epi8((a)))

#if defined(HAVE_AVX512)
static really_inline m128 cast512to128(const m512 in) {
    return _mm512_castsi512_si128(in);
}
#endif

static really_inline m128 set16x8(u8 c) {
    return _mm_set1_epi8(c);
}

static really_inline m128 set4x32(u32 c) {
    return _mm_set1_epi32(c);
}

static really_inline u32 movd(const m128 in) {
    return _mm_cvtsi128_si32(in);
}

#if defined(HAVE_AVX512)
static really_inline u32 movd512(const m512 in) {
    // NOTE: gcc doesn't seem to support _mm512_cvtsi512_si32(in),
    //       so we use a 2-step conversion to work around it.
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}

static really_inline u64a movq512(const m512 in) {
    // NOTE: AVX512 doesn't seem to provide _mm512_cvtsi512_si64(in),
    //       so we use a 2-step conversion to work around it.
    return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
}
#endif

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
    return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
    u32 lo = movd(in);
    u32 hi = movd(_mm_srli_epi64(in, 32));
    return (u64a)hi << 32 | lo;
#endif
}

/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
    return _mm_set_epi64x(0LL, *p);
}

#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
#else
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
#endif

#if !defined(HAVE_AVX2)
// TODO: this entire file needs restructuring - this carveout is awful
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
#if defined(HAVE_SSE41)
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
#else
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
#endif

#endif // !AVX2

static really_inline m128 and128(m128 a, m128 b) {
    return _mm_and_si128(a,b);
}

static really_inline m128 xor128(m128 a, m128 b) {
    return _mm_xor_si128(a,b);
}

static really_inline m128 or128(m128 a, m128 b) {
    return _mm_or_si128(a,b);
}

#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}

static really_inline m512 expand256(m256 a) {
    return _mm512_broadcast_i64x4(a);
}

static really_inline m512 expand384(m384 a) {
    u64a *lo = (u64a*)&a.lo;
    u64a *mid = (u64a*)&a.mid;
    u64a *hi = (u64a*)&a.hi;
    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
                            lo[1], lo[0]);
}
#endif

static really_inline m128 andnot128(m128 a, m128 b) {
    return _mm_andnot_si128(a, b);
}

// aligned load
static really_inline m128 load128(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    return _mm_load_si128((const m128 *)ptr);
}

// aligned store
static really_inline void store128(void *ptr, m128 a) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    *(m128 *)ptr = a;
}

// unaligned load
static really_inline m128 loadu128(const void *ptr) {
    return _mm_loadu_si128((const m128 *)ptr);
}

// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
    _mm_storeu_si128((m128 *)ptr, a);
}

// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
    m128 a = zeroes128();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif

static really_inline
m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu128(&simd_onebit_masks[mask_idx]);
}
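/*
 * Note: the index arithmetic above relies on the layout of simd_onebit_masks
 * (defined elsewhere) so that the unaligned load returns a vector with
 * exactly bit n set.
 */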

// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
    *ptr = or128(mask1bit128(n), *ptr);
}

// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
    *ptr = andnot128(mask1bit128(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
    const m128 mask = mask1bit128(n);
#if defined(HAVE_SSE41)
    return !_mm_testz_si128(mask, val);
#else
    return isnonzero128(and128(mask, val));
#endif
}
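/*
 * Usage sketch for the bit primitives above:
 *     m128 v = zeroes128();
 *     setbit128(&v, 42);
 *     assert(testbit128(v, 42));
 *     clearbit128(&v, 42);
 *     assert(!isnonzero128(v));
 */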

// offset must be an immediate
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)
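/*
 * palignr concatenates r (high) and l (low) into a 32-byte value and extracts
 * the 16 bytes starting at byte `offset` of that concatenation.
 */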

static really_inline
m128 pshufb_m128(m128 a, m128 b) {
    m128 result;
    result = _mm_shuffle_epi8(a, b);
    return result;
}

static really_inline
m256 pshufb_m256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return _mm256_shuffle_epi8(a, b);
#else
    m256 rv;
    rv.lo = pshufb_m128(a.lo, b.lo);
    rv.hi = pshufb_m128(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
    return _mm512_shuffle_epi8(a, b);
}

static really_inline
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
    return _mm512_maskz_shuffle_epi8(k, a, b);
}

#if defined(HAVE_AVX512VBMI)
#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
#endif

#endif

static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
    assert(amount >= -16 && amount <= 16);
    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
    return pshufb_m128(in, shift_mask);
}
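/*
 * Note (informal reading of the code above): the pshufb control is loaded at
 * an offset into vbs_mask_data, so a positive amount moves bytes towards
 * higher byte positions and a negative amount towards lower ones; the bytes
 * shifted in are determined by the padding in vbs_mask_data (defined
 * elsewhere).
 */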

static really_inline
m128 max_u8_m128(m128 a, m128 b) {
    return _mm_max_epu8(a, b);
}

static really_inline
m128 min_u8_m128(m128 a, m128 b) {
    return _mm_min_epu8(a, b);
}

static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
    return _mm_adds_epu8(a, b);
}

static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
    return _mm_sub_epi8(a, b);
}

static really_inline
m128 set64x2(u64a hi, u64a lo) {
    return _mm_set_epi64x(hi, lo);
}

/****
 **** 256-bit Primitives
 ****/

#if defined(HAVE_AVX2)

static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm256_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm256_sll_epi64(a, x);
}

#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

static really_inline
m256 set32x8(u32 in) {
    return _mm256_set1_epi8(in);
}

#define eq256(a, b)     _mm256_cmpeq_epi8((a), (b))
#define movemask256(a)  ((u32)_mm256_movemask_epi8((a)))

static really_inline
m256 set2x128(m128 a) {
    return _mm256_broadcastsi128_si256(a);
}

#else

static really_really_inline
m256 lshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = lshift64_m128(rv.lo, b);
    rv.hi = lshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 rshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = rshift64_m128(rv.lo, b);
    rv.hi = rshift64_m128(rv.hi, b);
    return rv;
}
static really_inline
m256 set32x8(u32 in) {
    m256 rv;
    rv.lo = set16x8((u8) in);
    rv.hi = rv.lo;
    return rv;
}

static really_inline
m256 eq256(m256 a, m256 b) {
    m256 rv;
    rv.lo = eq128(a.lo, b.lo);
    rv.hi = eq128(a.hi, b.hi);
    return rv;
}

static really_inline
u32 movemask256(m256 a) {
    u32 lo_mask = movemask128(a.lo);
    u32 hi_mask = movemask128(a.hi);
    return lo_mask | (hi_mask << 16);
}

static really_inline
m256 set2x128(m128 a) {
    m256 rv = {a, a};
    return rv;
}
#endif

static really_inline m256 zeroes256(void) {
#if defined(HAVE_AVX2)
    return _mm256_setzero_si256();
#else
    m256 rv = {zeroes128(), zeroes128()};
    return rv;
#endif
}

static really_inline m256 ones256(void) {
#if defined(HAVE_AVX2)
    m256 rv = _mm256_set1_epi8(0xFF);
#else
    m256 rv = {ones128(), ones128()};
#endif
    return rv;
}

#if defined(HAVE_AVX2)
static really_inline m256 and256(m256 a, m256 b) {
    return _mm256_and_si256(a, b);
}
#else
static really_inline m256 and256(m256 a, m256 b) {
    m256 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 or256(m256 a, m256 b) {
    return _mm256_or_si256(a, b);
}
#else
static really_inline m256 or256(m256 a, m256 b) {
    m256 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 xor256(m256 a, m256 b) {
    return _mm256_xor_si256(a, b);
}
#else
static really_inline m256 xor256(m256 a, m256 b) {
    m256 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 not256(m256 a) {
    return _mm256_xor_si256(a, ones256());
}
#else
static really_inline m256 not256(m256 a) {
    m256 rv;
    rv.lo = not128(a.lo);
    rv.hi = not128(a.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 andnot256(m256 a, m256 b) {
    return _mm256_andnot_si256(a, b);
}
#else
static really_inline m256 andnot256(m256 a, m256 b) {
    m256 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}
#endif

static really_inline int diff256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
#else
    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
#endif
}

static really_inline int isnonzero256(m256 a) {
#if defined(HAVE_AVX2)
    return !!diff256(a, zeroes256());
#else
    return isnonzero128(or128(a.lo, a.hi));
#endif
}

/**
 * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    a = _mm256_cmpeq_epi32(a, b);
    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
#else
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
    return ~(_mm_movemask_epi8(packed)) & 0xff;
#endif
}

/**
 * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
 * returns an 8-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_256(m256 a, m256 b) {
    u32 d = diffrich256(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m256 load256(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    return _mm256_load_si256((const m256 *)ptr);
#else
    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
    return rv;
#endif
}

// aligned load of a 128-bit value into both the low and high halves of a 256-bit value
static really_inline m256 load2x128(const void *ptr) {
#if defined(HAVE_AVX2)
    return set2x128(load128(ptr));
#else
    assert(ISALIGNED_N(ptr, alignof(m128)));
    m256 rv;
    rv.hi = rv.lo = load128(ptr);
    return rv;
#endif
}

static really_inline m256 loadu2x128(const void *ptr) {
    return set2x128(loadu128(ptr));
}

// aligned store
static really_inline void store256(void *ptr, m256 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    _mm256_store_si256((m256 *)ptr, a);
#else
    ptr = assume_aligned(ptr, 16);
    *(m256 *)ptr = a;
#endif
}

// unaligned load
static really_inline m256 loadu256(const void *ptr) {
#if defined(HAVE_AVX2)
    return _mm256_loadu_si256((const m256 *)ptr);
#else
    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
    return rv;
#endif
}

// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
#if defined(HAVE_AVX2)
    _mm256_storeu_si256((m256 *)ptr, a);
#else
    storeu128(ptr, a.lo);
    storeu128((char *)ptr + 16, a.hi);
#endif
}

// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
    m256 a = zeroes256();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

static really_inline
m256 mask1bit256(unsigned int n) {
    assert(n < sizeof(m256) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu256(&simd_onebit_masks[mask_idx]);
}

static really_inline
m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
#if defined(HAVE_AVX2)
    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
#else
    m256 rv;
    rv.hi = set64x2(hi_1, hi_0);
    rv.lo = set64x2(lo_1, lo_0);
    return rv;
#endif
}

#if !defined(HAVE_AVX2)
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    setbit128(sub, n);
}

// switches off bit N in the given vector.
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    clearbit128(sub, n);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 128;
    }
    return testbit128(sub, n);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return x.hi;
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return x.lo;
}

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
    m256 rv = {lo, hi};
    return rv;
}

#else // AVX2

// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    *ptr = or256(mask1bit256(n), *ptr);
}

static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    *ptr = andnot256(mask1bit256(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    const m256 mask = mask1bit256(n);
    return !_mm256_testz_si256(mask, val);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return _mm256_extracti128_si256(x, 1);
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return _mm256_extracti128_si256(x, 0);
}

#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)
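// Note: the 0x4E control in swap128in256 selects 64-bit lanes 2, 3, 0, 1,
// i.e. it swaps the two 128-bit halves of the vector.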

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
#if defined(_mm256_set_m128i)
    return _mm256_set_m128i(hi, lo);
#else
    return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif //AVX2
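// Note: combine2x128 takes the high half first, matching the argument order
// of _mm256_set_m128i(hi, lo).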

#if defined(HAVE_AVX512)
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
#define set2x256(a) _mm512_broadcast_i64x4(a)
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
#endif

/****
 **** 384-bit Primitives
 ****/

static really_inline m384 and384(m384 a, m384 b) {
    m384 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.mid = and128(a.mid, b.mid);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}

static really_inline m384 or384(m384 a, m384 b) {
    m384 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.mid = or128(a.mid, b.mid);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}

static really_inline m384 xor384(m384 a, m384 b) {
    m384 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.mid = xor128(a.mid, b.mid);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}
static really_inline m384 not384(m384 a) {
    m384 rv;
    rv.lo = not128(a.lo);
    rv.mid = not128(a.mid);
    rv.hi = not128(a.hi);
    return rv;
}
static really_inline m384 andnot384(m384 a, m384 b) {
    m384 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.mid = andnot128(a.mid, b.mid);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}

static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
    m384 rv;
    rv.lo = lshift64_m128(a.lo, b);
    rv.mid = lshift64_m128(a.mid, b);
    rv.hi = lshift64_m128(a.hi, b);
    return rv;
}

static really_inline m384 zeroes384(void) {
    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
    return rv;
}

static really_inline m384 ones384(void) {
    m384 rv = {ones128(), ones128(), ones128()};
    return rv;
}

static really_inline int diff384(m384 a, m384 b) {
    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
}

static really_inline int isnonzero384(m384 a) {
    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}

/**
 * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich384(m384 a, m384 b) {
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
                                  _mm_packs_epi32(a.hi, z));
    return ~(_mm_movemask_epi8(packed)) & 0xfff;
}

/**
 * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
 * returns a 12-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_384(m384 a, m384 b) {
    u32 d = diffrich384(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline m384 load384(const void *ptr) {
    assert(ISALIGNED_16(ptr));
    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
                load128((const char *)ptr + 32) };
    return rv;
}

// aligned store
static really_inline void store384(void *ptr, m384 a) {
    assert(ISALIGNED_16(ptr));
    ptr = assume_aligned(ptr, 16);
    *(m384 *)ptr = a;
}

// unaligned load
static really_inline m384 loadu384(const void *ptr) {
    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
                loadu128((const char *)ptr + 32)};
    return rv;
}

// packed unaligned store of first N bytes
static really_inline
void storebytes384(void *ptr, m384 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m384 loadbytes384(const void *ptr, unsigned int n) {
    m384 a = zeroes384();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

// switches on bit N in the given vector.
static really_inline
void setbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    setbit128(sub, n % 128);
}

// switches off bit N in the given vector.
static really_inline
void clearbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    clearbit128(sub, n % 128);
}

// tests bit N in the given vector.
static really_inline
char testbit384(m384 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else if (n < 256) {
        sub = val.mid;
    } else {
        sub = val.hi;
    }
    return testbit128(sub, n % 128);
}

/****
 **** 512-bit Primitives
 ****/

#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))

static really_inline
m512 zeroes512(void) {
#if defined(HAVE_AVX512)
    return _mm512_setzero_si512();
#else
    m512 rv = {zeroes256(), zeroes256()};
    return rv;
#endif
}

static really_inline
m512 ones512(void) {
#if defined(HAVE_AVX512)
    return _mm512_set1_epi8(0xFF);
    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
#else
    m512 rv = {ones256(), ones256()};
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 set64x8(u8 a) {
    return _mm512_set1_epi8(a);
}

static really_inline
m512 set8x64(u64a a) {
    return _mm512_set1_epi64(a);
}

static really_inline
m512 set16x32(u32 a) {
    return _mm512_set1_epi32(a);
}

static really_inline
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
                            lo_3, lo_2, lo_1, lo_0);
}

static really_inline
m512 swap256in512(m512 a) {
    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
    return vpermq512(idx, a);
}

static really_inline
m512 set4x128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}

static really_inline
m512 sadd_u8_m512(m512 a, m512 b) {
    return _mm512_adds_epu8(a, b);
}

static really_inline
m512 max_u8_m512(m512 a, m512 b) {
    return _mm512_max_epu8(a, b);
}

static really_inline
m512 min_u8_m512(m512 a, m512 b) {
    return _mm512_min_epu8(a, b);
}

static really_inline
m512 sub_u8_m512(m512 a, m512 b) {
    return _mm512_sub_epi8(a, b);
}
#endif

static really_inline
m512 and512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_and_si512(a, b);
#else
    m512 rv;
    rv.lo = and256(a.lo, b.lo);
    rv.hi = and256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 or512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_or_si512(a, b);
#else
    m512 rv;
    rv.lo = or256(a.lo, b.lo);
    rv.hi = or256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 xor512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, b);
#else
    m512 rv;
    rv.lo = xor256(a.lo, b.lo);
    rv.hi = xor256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 not512(m512 a) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, ones512());
#else
    m512 rv;
    rv.lo = not256(a.lo);
    rv.hi = not256(a.hi);
    return rv;
#endif
}

static really_inline
m512 andnot512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_andnot_si512(a, b);
#else
    m512 rv;
    rv.lo = andnot256(a.lo, b.lo);
    rv.hi = andnot256(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm512_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm512_sll_epi64(a, x);
}
#else
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
    m512 rv;
    rv.lo = lshift64_m256(a.lo, b);
    rv.hi = lshift64_m256(a.hi, b);
    return rv;
}
#endif

#if defined(HAVE_AVX512)
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
#endif

#if !defined(_MM_CMPINT_NE)
#define _MM_CMPINT_NE 0x4
#endif
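// Note: _MM_CMPINT_NE (0x4) is the "not equal" predicate used by the AVX-512
// _mm512_cmp_*_mask intrinsics; it is defined here for toolchains whose
// headers do not provide it.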

static really_inline
int diff512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
#else
    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
#endif
}

static really_inline
int isnonzero512(m512 a) {
#if defined(HAVE_AVX512)
    return diff512(a, zeroes512());
#elif defined(HAVE_AVX2)
    m256 x = or256(a.lo, a.hi);
    return !!diff256(x, zeroes256());
#else
    m128 x = or128(a.lo.lo, a.lo.hi);
    m128 y = or128(a.hi.lo, a.hi.hi);
    return isnonzero128(or128(x, y));
#endif
}

/**
 * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline
u32 diffrich512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
#elif defined(HAVE_AVX2)
    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
#else
    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
    return ~(_mm_movemask_epi8(packed)) & 0xffff;
#endif
}

/**
 * "Rich" version of diff512(), 64-bit variant. Takes two vectors a and b and
 * returns a 16-bit mask indicating which 64-bit words contain differences.
 */
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
    //TODO: cmp_epi64?
    u32 d = diffrich512(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline
m512 load512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_load_si512(ptr);
#else
    assert(ISALIGNED_N(ptr, alignof(m256)));
    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
    return rv;
#endif
}

// aligned store
static really_inline
void store512(void *ptr, m512 a) {
    assert(ISALIGNED_N(ptr, alignof(m512)));
#if defined(HAVE_AVX512)
    return _mm512_store_si512(ptr, a);
#elif defined(HAVE_AVX2)
    m512 *x = (m512 *)ptr;
    store256(&x->lo, a.lo);
    store256(&x->hi, a.hi);
#else
    ptr = assume_aligned(ptr, 16);
    *(m512 *)ptr = a;
#endif
}

// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_loadu_si512(ptr);
#else
    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
    return rv;
#endif
}

// unaligned store
static really_inline
void storeu512(void *ptr, m512 a) {
#if defined(HAVE_AVX512)
    _mm512_storeu_si512((m512 *)ptr, a);
#elif defined(HAVE_AVX2)
    storeu256(ptr, a.lo);
    storeu256((char *)ptr + 32, a.hi);
#else
    storeu128(ptr, a.lo.lo);
    storeu128((char *)ptr + 16, a.lo.hi);
    storeu128((char *)ptr + 32, a.hi.lo);
    storeu128((char *)ptr + 48, a.hi.hi);
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
    return _mm512_maskz_loadu_epi8(k, ptr);
}

static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
    return _mm512_mask_loadu_epi8(src, k, ptr);
}

static really_inline
void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
    _mm512_mask_storeu_epi8(ptr, k, a);
}

static really_inline
m512 set_mask_m512(__mmask64 k) {
    return _mm512_movm_epi8(k);
}

static really_inline
m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
    return _mm256_maskz_loadu_epi8(k, ptr);
}
#endif

// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
    m512 a = zeroes512();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

static really_inline
m512 mask1bit512(unsigned int n) {
    assert(n < sizeof(m512) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu512(&simd_onebit_masks[mask_idx]);
}

// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    setbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = or512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    setbit256(sub, n);
#endif
}

// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    clearbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = andnot512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    clearbit256(sub, n);
#endif
}

// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
#if !defined(HAVE_AVX2)
    m128 sub;
    if (n < 128) {
        sub = val.lo.lo;
    } else if (n < 256) {
        sub = val.lo.hi;
    } else if (n < 384) {
        sub = val.hi.lo;
    } else {
        sub = val.hi.hi;
    }
    return testbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    const m512 mask = mask1bit512(n);
    return !!_mm512_test_epi8_mask(mask, val);
#else
    m256 sub;
    if (n < 256) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 256;
    }
    return testbit256(sub, n);
#endif
}

#endif