/*
 * Copyright (c) 2015-2020, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief SIMD types and primitive operations.
 */

#ifndef SIMD_UTILS
#define SIMD_UTILS

#if !defined(_WIN32) && !defined(__SSSE3__)
#error SSSE3 instructions must be enabled
#endif

#include "config.h"
#include "ue2common.h"
#include "simd_types.h"
#include "unaligned.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#include <string.h> // for memcpy

// Define a common assume_aligned using an appropriate compiler built-in, if
// it's available. Note that we need to handle C or C++ compilation.
#ifdef __cplusplus
# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#else
# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED
#  define assume_aligned(x, y) __builtin_assume_aligned((x), (y))
# endif
#endif

// Fall back to the identity case.
#ifndef assume_aligned
#define assume_aligned(x, y) (x)
#endif

#ifdef __cplusplus
extern "C" {
#endif
extern const char vbs_mask_data[];
#ifdef __cplusplus
}
#endif

static really_inline m128 ones128(void) {
#if defined(__GNUC__) || defined(__INTEL_COMPILER)
    /* gcc gets this right */
    return _mm_set1_epi8(0xFF);
#else
    /* trick from Intel's optimization guide to generate all-ones.
     * ICC converts this to the single cmpeq instruction */
    return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128());
#endif
}

static really_inline m128 zeroes128(void) {
    return _mm_setzero_si128();
}

/** \brief Bitwise not for m128. */
static really_inline m128 not128(m128 a) {
    return _mm_xor_si128(a, ones128());
}

/** \brief Return 1 if a and b are different, otherwise 0. */
static really_inline int diff128(m128 a, m128 b) {
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff);
}

static really_inline int isnonzero128(m128 a) {
    return !!diff128(a, zeroes128());
}

/**
 * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich128(m128 a, m128 b) {
    a = _mm_cmpeq_epi32(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf;
}

/**
 * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and
 * returns a 4-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_128(m128 a, m128 b) {
#if defined(HAVE_SSE41)
    a = _mm_cmpeq_epi64(a, b);
    return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5;
#else
    u32 d = diffrich128(a, b);
    return (d | (d >> 1)) & 0x5;
#endif
}
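
/*
 * Illustrative sketch (hypothetical helper, not part of the original API):
 * diffrich128() sets bit i of its result when 32-bit word i of the two
 * vectors differs, so a single AND answers "does word w differ?".
 */
static really_inline
int word32_differs128(m128 a, m128 b, unsigned int w) {
    assert(w < 4);
    return !!(diffrich128(a, b) & (1U << w));
}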

static really_really_inline
m128 lshift64_m128(m128 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm_sll_epi64(a, x);
}

#define rshift64_m128(a, b) _mm_srli_epi64((a), (b))
#define eq128(a, b) _mm_cmpeq_epi8((a), (b))
#define movemask128(a) ((u32)_mm_movemask_epi8((a)))

#if defined(HAVE_AVX512)
static really_inline m128 cast512to128(const m512 in) {
    return _mm512_castsi512_si128(in);
}
#endif

static really_inline m128 set16x8(u8 c) {
    return _mm_set1_epi8(c);
}

static really_inline m128 set4x32(u32 c) {
    return _mm_set1_epi32(c);
}

static really_inline u32 movd(const m128 in) {
    return _mm_cvtsi128_si32(in);
}

#if defined(HAVE_AVX512)
static really_inline u32 movd512(const m512 in) {
    // NOTE: it seems gcc doesn't support _mm512_cvtsi512_si32(in),
    // so we use a 2-step conversion to work around it.
    return _mm_cvtsi128_si32(_mm512_castsi512_si128(in));
}

static really_inline u64a movq512(const m512 in) {
    // NOTE: it seems AVX512 doesn't support _mm512_cvtsi512_si64(in),
    // so we use a 2-step conversion to work around it.
    return _mm_cvtsi128_si64(_mm512_castsi512_si128(in));
}
#endif

static really_inline u64a movq(const m128 in) {
#if defined(ARCH_X86_64)
    return _mm_cvtsi128_si64(in);
#else // 32-bit - this is horrific
    u32 lo = movd(in);
    u32 hi = movd(_mm_srli_epi64(in, 32));
    return (u64a)hi << 32 | lo;
#endif
}

/* another form of movq */
static really_inline
m128 load_m128_from_u64a(const u64a *p) {
    return _mm_set_epi64x(0LL, *p);
}
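
/*
 * Illustrative sketch (hypothetical helper): movq() reads back the low 64
 * bits, so loading a u64a via load_m128_from_u64a() and extracting it again
 * is an identity round trip.
 */
static really_inline
u64a movq_roundtrip(u64a v) {
    return movq(load_m128_from_u64a(&v)); /* == v */
}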

#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed)
#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed)

#if defined(HAVE_SSE41)
#define extract32from128(a, imm) _mm_extract_epi32(a, imm)
#define extract64from128(a, imm) _mm_extract_epi64(a, imm)
#else
#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2))
#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3))
#endif

#if !defined(HAVE_AVX2)
// TODO: this entire file needs restructuring - this carveout is awful
#define extractlow64from256(a) movq(a.lo)
#define extractlow32from256(a) movd(a.lo)
#if defined(HAVE_SSE41)
#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4)
#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2)
#else
#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4))
#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8))
#endif

#endif // !AVX2

static really_inline m128 and128(m128 a, m128 b) {
    return _mm_and_si128(a, b);
}

static really_inline m128 xor128(m128 a, m128 b) {
    return _mm_xor_si128(a, b);
}

static really_inline m128 or128(m128 a, m128 b) {
    return _mm_or_si128(a, b);
}

#if defined(HAVE_AVX512VBMI)
static really_inline m512 expand128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}

static really_inline m512 expand256(m256 a) {
    return _mm512_broadcast_i64x4(a);
}

static really_inline m512 expand384(m384 a) {
    u64a *lo = (u64a *)&a.lo;
    u64a *mid = (u64a *)&a.mid;
    u64a *hi = (u64a *)&a.hi;
    return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0],
                            lo[1], lo[0]);
}
#endif

static really_inline m128 andnot128(m128 a, m128 b) {
    return _mm_andnot_si128(a, b);
}

// aligned load
static really_inline m128 load128(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    return _mm_load_si128((const m128 *)ptr);
}

// aligned store
static really_inline void store128(void *ptr, m128 a) {
    assert(ISALIGNED_N(ptr, alignof(m128)));
    ptr = assume_aligned(ptr, 16);
    *(m128 *)ptr = a;
}

// unaligned load
static really_inline m128 loadu128(const void *ptr) {
    return _mm_loadu_si128((const m128 *)ptr);
}

// unaligned store
static really_inline void storeu128(void *ptr, m128 a) {
    _mm_storeu_si128((m128 *)ptr, a);
}

// packed unaligned store of first N bytes
static really_inline
void storebytes128(void *ptr, m128 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m128 loadbytes128(const void *ptr, unsigned int n) {
    m128 a = zeroes128();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

#ifdef __cplusplus
extern "C" {
#endif
extern const u8 simd_onebit_masks[];
#ifdef __cplusplus
}
#endif

static really_inline
m128 mask1bit128(unsigned int n) {
    assert(n < sizeof(m128) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu128(&simd_onebit_masks[mask_idx]);
}

// switches on bit N in the given vector.
static really_inline
void setbit128(m128 *ptr, unsigned int n) {
    *ptr = or128(mask1bit128(n), *ptr);
}

// switches off bit N in the given vector.
static really_inline
void clearbit128(m128 *ptr, unsigned int n) {
    *ptr = andnot128(mask1bit128(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit128(m128 val, unsigned int n) {
    const m128 mask = mask1bit128(n);
#if defined(HAVE_SSE41)
    return !_mm_testz_si128(mask, val);
#else
    return isnonzero128(and128(mask, val));
#endif
}
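
/*
 * Illustrative sketch (hypothetical helper, not part of the original API):
 * the bit helpers above compose naturally; this flips bit N and reports
 * whether it was set beforehand.
 */
static really_inline
char togglebit128(m128 *ptr, unsigned int n) {
    char was_set = testbit128(*ptr, n);
    if (was_set) {
        clearbit128(ptr, n);
    } else {
        setbit128(ptr, n);
    }
    return was_set;
}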

// offset must be an immediate
#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset)

static really_inline
m128 pshufb_m128(m128 a, m128 b) {
    m128 result;
    result = _mm_shuffle_epi8(a, b);
    return result;
}

static really_inline
m256 pshufb_m256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return _mm256_shuffle_epi8(a, b);
#else
    m256 rv;
    rv.lo = pshufb_m128(a.lo, b.lo);
    rv.hi = pshufb_m128(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 pshufb_m512(m512 a, m512 b) {
    return _mm512_shuffle_epi8(a, b);
}

static really_inline
m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) {
    return _mm512_maskz_shuffle_epi8(k, a, b);
}

#if defined(HAVE_AVX512VBMI)
#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a)
#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a)
#endif

#endif

static really_inline
m128 variable_byte_shift_m128(m128 in, s32 amount) {
    assert(amount >= -16 && amount <= 16);
    m128 shift_mask = loadu128(vbs_mask_data + 16 - amount);
    return pshufb_m128(in, shift_mask);
}

static really_inline
m128 max_u8_m128(m128 a, m128 b) {
    return _mm_max_epu8(a, b);
}

static really_inline
m128 min_u8_m128(m128 a, m128 b) {
    return _mm_min_epu8(a, b);
}

static really_inline
m128 sadd_u8_m128(m128 a, m128 b) {
    return _mm_adds_epu8(a, b);
}

static really_inline
m128 sub_u8_m128(m128 a, m128 b) {
    return _mm_sub_epi8(a, b);
}

static really_inline
m128 set64x2(u64a hi, u64a lo) {
    return _mm_set_epi64x(hi, lo);
}
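
/*
 * Illustrative sketch (hypothetical check): sadd_u8_m128() saturates at 0xff
 * rather than wrapping, so adding one to an all-0xff vector leaves it
 * unchanged.
 */
static really_inline
int sadd_u8_saturates(void) {
    m128 all_ff = ones128();
    return !diff128(sadd_u8_m128(all_ff, set16x8(1)), all_ff); /* always 1 */
}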

/****
 **** 256-bit Primitives
 ****/

#if defined(HAVE_AVX2)

static really_really_inline
m256 lshift64_m256(m256 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm256_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm256_sll_epi64(a, x);
}

#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b))

static really_inline
m256 set32x8(u32 in) {
    return _mm256_set1_epi8(in);
}

#define eq256(a, b) _mm256_cmpeq_epi8((a), (b))
#define movemask256(a) ((u32)_mm256_movemask_epi8((a)))

static really_inline
m256 set2x128(m128 a) {
    return _mm256_broadcastsi128_si256(a);
}

#else

static really_really_inline
m256 lshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = lshift64_m128(rv.lo, b);
    rv.hi = lshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 rshift64_m256(m256 a, int b) {
    m256 rv = a;
    rv.lo = rshift64_m128(rv.lo, b);
    rv.hi = rshift64_m128(rv.hi, b);
    return rv;
}

static really_inline
m256 set32x8(u32 in) {
    m256 rv;
    rv.lo = set16x8((u8)in);
    rv.hi = rv.lo;
    return rv;
}

static really_inline
m256 eq256(m256 a, m256 b) {
    m256 rv;
    rv.lo = eq128(a.lo, b.lo);
    rv.hi = eq128(a.hi, b.hi);
    return rv;
}

static really_inline
u32 movemask256(m256 a) {
    u32 lo_mask = movemask128(a.lo);
    u32 hi_mask = movemask128(a.hi);
    return lo_mask | (hi_mask << 16);
}

static really_inline
m256 set2x128(m128 a) {
    m256 rv = {a, a};
    return rv;
}
#endif

static really_inline m256 zeroes256(void) {
#if defined(HAVE_AVX2)
    return _mm256_setzero_si256();
#else
    m256 rv = {zeroes128(), zeroes128()};
    return rv;
#endif
}

static really_inline m256 ones256(void) {
#if defined(HAVE_AVX2)
    m256 rv = _mm256_set1_epi8(0xFF);
#else
    m256 rv = {ones128(), ones128()};
#endif
    return rv;
}

#if defined(HAVE_AVX2)
static really_inline m256 and256(m256 a, m256 b) {
    return _mm256_and_si256(a, b);
}
#else
static really_inline m256 and256(m256 a, m256 b) {
    m256 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 or256(m256 a, m256 b) {
    return _mm256_or_si256(a, b);
}
#else
static really_inline m256 or256(m256 a, m256 b) {
    m256 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 xor256(m256 a, m256 b) {
    return _mm256_xor_si256(a, b);
}
#else
static really_inline m256 xor256(m256 a, m256 b) {
    m256 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 not256(m256 a) {
    return _mm256_xor_si256(a, ones256());
}
#else
static really_inline m256 not256(m256 a) {
    m256 rv;
    rv.lo = not128(a.lo);
    rv.hi = not128(a.hi);
    return rv;
}
#endif

#if defined(HAVE_AVX2)
static really_inline m256 andnot256(m256 a, m256 b) {
    return _mm256_andnot_si256(a, b);
}
#else
static really_inline m256 andnot256(m256 a, m256 b) {
    m256 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}
#endif

static really_inline int diff256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1);
#else
    return diff128(a.lo, b.lo) || diff128(a.hi, b.hi);
#endif
}

static really_inline int isnonzero256(m256 a) {
#if defined(HAVE_AVX2)
    return !!diff256(a, zeroes256());
#else
    return isnonzero128(or128(a.lo, a.hi));
#endif
}

/**
 * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich256(m256 a, m256 b) {
#if defined(HAVE_AVX2)
    a = _mm256_cmpeq_epi32(a, b);
    return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF;
#else
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z);
    return ~(_mm_movemask_epi8(packed)) & 0xff;
#endif
}

/**
 * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and
 * returns an 8-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_256(m256 a, m256 b) {
    u32 d = diffrich256(a, b);
    return (d | (d >> 1)) & 0x55555555;
}
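
/*
 * Illustrative sketch (hypothetical helper): diffrich64_256() reports a
 * differing 64-bit word w by setting bit 2*w of its result.
 */
static really_inline
int word64_differs256(m256 a, m256 b, unsigned int w) {
    assert(w < 4);
    return !!(diffrich64_256(a, b) & (1U << (2 * w)));
}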

// aligned load
static really_inline m256 load256(const void *ptr) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    return _mm256_load_si256((const m256 *)ptr);
#else
    m256 rv = { load128(ptr), load128((const char *)ptr + 16) };
    return rv;
#endif
}

// aligned load of 128-bit value to low and high part of 256-bit value
static really_inline m256 load2x128(const void *ptr) {
#if defined(HAVE_AVX2)
    return set2x128(load128(ptr));
#else
    assert(ISALIGNED_N(ptr, alignof(m128)));
    m256 rv;
    rv.hi = rv.lo = load128(ptr);
    return rv;
#endif
}

static really_inline m256 loadu2x128(const void *ptr) {
    return set2x128(loadu128(ptr));
}

// aligned store
static really_inline void store256(void *ptr, m256 a) {
    assert(ISALIGNED_N(ptr, alignof(m256)));
#if defined(HAVE_AVX2)
    _mm256_store_si256((m256 *)ptr, a);
#else
    ptr = assume_aligned(ptr, 16);
    *(m256 *)ptr = a;
#endif
}

// unaligned load
static really_inline m256 loadu256(const void *ptr) {
#if defined(HAVE_AVX2)
    return _mm256_loadu_si256((const m256 *)ptr);
#else
    m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) };
    return rv;
#endif
}

// unaligned store
static really_inline void storeu256(void *ptr, m256 a) {
#if defined(HAVE_AVX2)
    _mm256_storeu_si256((m256 *)ptr, a);
#else
    storeu128(ptr, a.lo);
    storeu128((char *)ptr + 16, a.hi);
#endif
}

// packed unaligned store of first N bytes
static really_inline
void storebytes256(void *ptr, m256 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m256 loadbytes256(const void *ptr, unsigned int n) {
    m256 a = zeroes256();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

static really_inline
m256 mask1bit256(unsigned int n) {
    assert(n < sizeof(m256) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu256(&simd_onebit_masks[mask_idx]);
}

static really_inline
m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) {
#if defined(HAVE_AVX2)
    return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0);
#else
    m256 rv;
    rv.hi = set64x2(hi_1, hi_0);
    rv.lo = set64x2(lo_1, lo_0);
    return rv;
#endif
}
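
/*
 * Illustrative sketch (hypothetical check): set64x4() places lo_0 in the
 * least significant 64-bit word, which diffrich64_256() reports as word 0.
 */
static really_inline
u32 set64x4_lane_order(void) {
    m256 a = set64x4(0, 0, 0, 0);
    m256 b = set64x4(0, 0, 0, 1);
    return diffrich64_256(a, b); /* == 0x1: only 64-bit word 0 differs */
}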

#if !defined(HAVE_AVX2)
// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    setbit128(sub, n);
}

// switches off bit N in the given vector.
static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 128;
    }
    clearbit128(sub, n);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 128;
    }
    return testbit128(sub, n);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return x.hi;
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return x.lo;
}

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
    m256 rv = {lo, hi};
    return rv;
}

#else // AVX2

// switches on bit N in the given vector.
static really_inline
void setbit256(m256 *ptr, unsigned int n) {
    *ptr = or256(mask1bit256(n), *ptr);
}

static really_inline
void clearbit256(m256 *ptr, unsigned int n) {
    *ptr = andnot256(mask1bit256(n), *ptr);
}

// tests bit N in the given vector.
static really_inline
char testbit256(m256 val, unsigned int n) {
    const m256 mask = mask1bit256(n);
    return !_mm256_testz_si256(mask, val);
}

static really_really_inline
m128 movdq_hi(m256 x) {
    return _mm256_extracti128_si256(x, 1);
}

static really_really_inline
m128 movdq_lo(m256 x) {
    return _mm256_extracti128_si256(x, 0);
}

#define cast256to128(a) _mm256_castsi256_si128(a)
#define cast128to256(a) _mm256_castsi128_si256(a)
#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E)
#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm)
#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed)
#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed)
#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2)
#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4)
#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a))
#define extractlow32from256(a) movd(cast256to128(a))
#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b)
#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b)
#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset)

static really_inline
m256 combine2x128(m128 hi, m128 lo) {
#if defined(_mm256_set_m128i)
    return _mm256_set_m128i(hi, lo);
#else
    return insert128to256(cast128to256(lo), hi, 1);
#endif
}
#endif // AVX2

#if defined(HAVE_AVX512)
#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm)
#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b)
#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b)
#define set2x256(a) _mm512_broadcast_i64x4(a)
#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a)
#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a)
#endif

/****
 **** 384-bit Primitives
 ****/

static really_inline m384 and384(m384 a, m384 b) {
    m384 rv;
    rv.lo = and128(a.lo, b.lo);
    rv.mid = and128(a.mid, b.mid);
    rv.hi = and128(a.hi, b.hi);
    return rv;
}

static really_inline m384 or384(m384 a, m384 b) {
    m384 rv;
    rv.lo = or128(a.lo, b.lo);
    rv.mid = or128(a.mid, b.mid);
    rv.hi = or128(a.hi, b.hi);
    return rv;
}

static really_inline m384 xor384(m384 a, m384 b) {
    m384 rv;
    rv.lo = xor128(a.lo, b.lo);
    rv.mid = xor128(a.mid, b.mid);
    rv.hi = xor128(a.hi, b.hi);
    return rv;
}

static really_inline m384 not384(m384 a) {
    m384 rv;
    rv.lo = not128(a.lo);
    rv.mid = not128(a.mid);
    rv.hi = not128(a.hi);
    return rv;
}

static really_inline m384 andnot384(m384 a, m384 b) {
    m384 rv;
    rv.lo = andnot128(a.lo, b.lo);
    rv.mid = andnot128(a.mid, b.mid);
    rv.hi = andnot128(a.hi, b.hi);
    return rv;
}

static really_really_inline
m384 lshift64_m384(m384 a, unsigned b) {
    m384 rv;
    rv.lo = lshift64_m128(a.lo, b);
    rv.mid = lshift64_m128(a.mid, b);
    rv.hi = lshift64_m128(a.hi, b);
    return rv;
}

static really_inline m384 zeroes384(void) {
    m384 rv = {zeroes128(), zeroes128(), zeroes128()};
    return rv;
}

static really_inline m384 ones384(void) {
    m384 rv = {ones128(), ones128(), ones128()};
    return rv;
}

static really_inline int diff384(m384 a, m384 b) {
    return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, b.hi);
}

static really_inline int isnonzero384(m384 a) {
    return isnonzero128(or128(or128(a.lo, a.mid), a.hi));
}

/**
 * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline u32 diffrich384(m384 a, m384 b) {
    m128 z = zeroes128();
    a.lo = _mm_cmpeq_epi32(a.lo, b.lo);
    a.mid = _mm_cmpeq_epi32(a.mid, b.mid);
    a.hi = _mm_cmpeq_epi32(a.hi, b.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid),
                                  _mm_packs_epi32(a.hi, z));
    return ~(_mm_movemask_epi8(packed)) & 0xfff;
}

/**
 * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and
 * returns a 12-bit mask indicating which 64-bit words contain differences.
 */
static really_inline u32 diffrich64_384(m384 a, m384 b) {
    u32 d = diffrich384(a, b);
    return (d | (d >> 1)) & 0x55555555;
}
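
/*
 * Illustrative sketch (hypothetical check): not384() of an all-zero vector
 * is the all-ones vector, so diff384() sees it as equal to ones384().
 */
static really_inline
int not384_of_zeroes_is_ones(void) {
    return !diff384(not384(zeroes384()), ones384()); /* always 1 */
}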

// aligned load
static really_inline m384 load384(const void *ptr) {
    assert(ISALIGNED_16(ptr));
    m384 rv = { load128(ptr), load128((const char *)ptr + 16),
                load128((const char *)ptr + 32) };
    return rv;
}

// aligned store
static really_inline void store384(void *ptr, m384 a) {
    assert(ISALIGNED_16(ptr));
    ptr = assume_aligned(ptr, 16);
    *(m384 *)ptr = a;
}

// unaligned load
static really_inline m384 loadu384(const void *ptr) {
    m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16),
                loadu128((const char *)ptr + 32) };
    return rv;
}

// packed unaligned store of first N bytes
static really_inline
void storebytes384(void *ptr, m384 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m384 loadbytes384(const void *ptr, unsigned int n) {
    m384 a = zeroes384();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}

// switches on bit N in the given vector.
static really_inline
void setbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    setbit128(sub, n % 128);
}

// switches off bit N in the given vector.
static really_inline
void clearbit384(m384 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo;
    } else if (n < 256) {
        sub = &ptr->mid;
    } else {
        sub = &ptr->hi;
    }
    clearbit128(sub, n % 128);
}

// tests bit N in the given vector.
static really_inline
char testbit384(m384 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
    m128 sub;
    if (n < 128) {
        sub = val.lo;
    } else if (n < 256) {
        sub = val.mid;
    } else {
        sub = val.hi;
    }
    return testbit128(sub, n % 128);
}
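
/*
 * Illustrative sketch (hypothetical helper): setting a bit in a zeroed
 * 384-bit vector and testing it again always succeeds, for any of the 384
 * bit positions.
 */
static really_inline
char setbit384_roundtrip(unsigned int n) {
    assert(n < sizeof(m384) * 8);
    m384 v = zeroes384();
    setbit384(&v, n);
    return testbit384(v, n); /* always 1 */
}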

/****
 **** 512-bit Primitives
 ****/

#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b))
#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b))

static really_inline
m512 zeroes512(void) {
#if defined(HAVE_AVX512)
    return _mm512_setzero_si512();
#else
    m512 rv = {zeroes256(), zeroes256()};
    return rv;
#endif
}

static really_inline
m512 ones512(void) {
#if defined(HAVE_AVX512)
    return _mm512_set1_epi8(0xFF);
    //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512());
#else
    m512 rv = {ones256(), ones256()};
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 set64x8(u8 a) {
    return _mm512_set1_epi8(a);
}

static really_inline
m512 set8x64(u64a a) {
    return _mm512_set1_epi64(a);
}

static really_inline
m512 set16x32(u32 a) {
    return _mm512_set1_epi32(a);
}

static really_inline
m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0,
               u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) {
    return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0,
                            lo_3, lo_2, lo_1, lo_0);
}

static really_inline
m512 swap256in512(m512 a) {
    m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL);
    return vpermq512(idx, a);
}

static really_inline
m512 set4x128(m128 a) {
    return _mm512_broadcast_i32x4(a);
}

static really_inline
m512 sadd_u8_m512(m512 a, m512 b) {
    return _mm512_adds_epu8(a, b);
}

static really_inline
m512 max_u8_m512(m512 a, m512 b) {
    return _mm512_max_epu8(a, b);
}

static really_inline
m512 min_u8_m512(m512 a, m512 b) {
    return _mm512_min_epu8(a, b);
}

static really_inline
m512 sub_u8_m512(m512 a, m512 b) {
    return _mm512_sub_epi8(a, b);
}
#endif

static really_inline
m512 and512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_and_si512(a, b);
#else
    m512 rv;
    rv.lo = and256(a.lo, b.lo);
    rv.hi = and256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 or512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_or_si512(a, b);
#else
    m512 rv;
    rv.lo = or256(a.lo, b.lo);
    rv.hi = or256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 xor512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, b);
#else
    m512 rv;
    rv.lo = xor256(a.lo, b.lo);
    rv.hi = xor256(a.hi, b.hi);
    return rv;
#endif
}

static really_inline
m512 not512(m512 a) {
#if defined(HAVE_AVX512)
    return _mm512_xor_si512(a, ones512());
#else
    m512 rv;
    rv.lo = not256(a.lo);
    rv.hi = not256(a.hi);
    return rv;
#endif
}

static really_inline
m512 andnot512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_andnot_si512(a, b);
#else
    m512 rv;
    rv.lo = andnot256(a.lo, b.lo);
    rv.hi = andnot256(a.hi, b.hi);
    return rv;
#endif
}

#if defined(HAVE_AVX512)
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
#if defined(HAVE__BUILTIN_CONSTANT_P)
    if (__builtin_constant_p(b)) {
        return _mm512_slli_epi64(a, b);
    }
#endif
    m128 x = _mm_cvtsi32_si128(b);
    return _mm512_sll_epi64(a, x);
}
#else
static really_really_inline
m512 lshift64_m512(m512 a, unsigned b) {
    m512 rv;
    rv.lo = lshift64_m256(a.lo, b);
    rv.hi = lshift64_m256(a.hi, b);
    return rv;
}
#endif

#if defined(HAVE_AVX512)
#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b))
#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed)
#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed)
#endif

#if !defined(_MM_CMPINT_NE)
#define _MM_CMPINT_NE 0x4
#endif

static really_inline
int diff512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE);
#else
    return diff256(a.lo, b.lo) || diff256(a.hi, b.hi);
#endif
}

static really_inline
int isnonzero512(m512 a) {
#if defined(HAVE_AVX512)
    return diff512(a, zeroes512());
#elif defined(HAVE_AVX2)
    m256 x = or256(a.lo, a.hi);
    return !!diff256(x, zeroes256());
#else
    m128 x = or128(a.lo.lo, a.lo.hi);
    m128 y = or128(a.hi.lo, a.hi.hi);
    return isnonzero128(or128(x, y));
#endif
}

/**
 * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit
 * mask indicating which 32-bit words contain differences.
 */
static really_inline
u32 diffrich512(m512 a, m512 b) {
#if defined(HAVE_AVX512)
    return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE);
#elif defined(HAVE_AVX2)
    return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8);
#else
    a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo);
    a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi);
    a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo);
    a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi);
    m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi),
                                  _mm_packs_epi32(a.hi.lo, a.hi.hi));
    return ~(_mm_movemask_epi8(packed)) & 0xffff;
#endif
}

/**
 * "Rich" version of diff512(), 64-bit variant. Takes two vectors a and b and
 * returns a 16-bit mask indicating which 64-bit words contain differences.
 */
static really_inline
u32 diffrich64_512(m512 a, m512 b) {
    //TODO: cmp_epi64?
    u32 d = diffrich512(a, b);
    return (d | (d >> 1)) & 0x55555555;
}

// aligned load
static really_inline
m512 load512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_load_si512(ptr);
#else
    assert(ISALIGNED_N(ptr, alignof(m256)));
    m512 rv = { load256(ptr), load256((const char *)ptr + 32) };
    return rv;
#endif
}

// aligned store
static really_inline
void store512(void *ptr, m512 a) {
    assert(ISALIGNED_N(ptr, alignof(m512)));
#if defined(HAVE_AVX512)
    return _mm512_store_si512(ptr, a);
#elif defined(HAVE_AVX2)
    m512 *x = (m512 *)ptr;
    store256(&x->lo, a.lo);
    store256(&x->hi, a.hi);
#else
    ptr = assume_aligned(ptr, 16);
    *(m512 *)ptr = a;
#endif
}

// unaligned load
static really_inline
m512 loadu512(const void *ptr) {
#if defined(HAVE_AVX512)
    return _mm512_loadu_si512(ptr);
#else
    m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) };
    return rv;
#endif
}

// unaligned store
static really_inline
void storeu512(void *ptr, m512 a) {
#if defined(HAVE_AVX512)
    _mm512_storeu_si512((m512 *)ptr, a);
#elif defined(HAVE_AVX2)
    storeu256(ptr, a.lo);
    storeu256((char *)ptr + 32, a.hi);
#else
    storeu128(ptr, a.lo.lo);
    storeu128((char *)ptr + 16, a.lo.hi);
    storeu128((char *)ptr + 32, a.hi.lo);
    storeu128((char *)ptr + 48, a.hi.hi);
#endif
}

#if defined(HAVE_AVX512)
static really_inline
m512 loadu_maskz_m512(__mmask64 k, const void *ptr) {
    return _mm512_maskz_loadu_epi8(k, ptr);
}

static really_inline
m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) {
    return _mm512_mask_loadu_epi8(src, k, ptr);
}

static really_inline
void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) {
    _mm512_mask_storeu_epi8(ptr, k, a);
}

static really_inline
m512 set_mask_m512(__mmask64 k) {
    return _mm512_movm_epi8(k);
}

static really_inline
m256 loadu_maskz_m256(__mmask32 k, const void *ptr) {
    return _mm256_maskz_loadu_epi8(k, ptr);
}
#endif

// packed unaligned store of first N bytes
static really_inline
void storebytes512(void *ptr, m512 a, unsigned int n) {
    assert(n <= sizeof(a));
    memcpy(ptr, &a, n);
}

// packed unaligned load of first N bytes, pad with zero
static really_inline
m512 loadbytes512(const void *ptr, unsigned int n) {
    m512 a = zeroes512();
    assert(n <= sizeof(a));
    memcpy(&a, ptr, n);
    return a;
}
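
/*
 * Illustrative sketch (hypothetical check): loadbytes512() zero-pads beyond
 * the first n bytes, so clearing that prefix again yields an all-zero vector.
 */
static really_inline
int loadbytes512_zero_pads(const void *ptr, unsigned int n) {
    m512 v = loadbytes512(ptr, n);
    storebytes512(&v, zeroes512(), n); /* clear the copied prefix */
    return !diff512(v, zeroes512());   /* always 1 */
}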

static really_inline
m512 mask1bit512(unsigned int n) {
    assert(n < sizeof(m512) * 8);
    u32 mask_idx = ((n % 8) * 64) + 95;
    mask_idx -= n / 8;
    return loadu512(&simd_onebit_masks[mask_idx]);
}

// switches on bit N in the given vector.
static really_inline
void setbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    setbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = or512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    setbit256(sub, n);
#endif
}

// switches off bit N in the given vector.
static really_inline
void clearbit512(m512 *ptr, unsigned int n) {
    assert(n < sizeof(*ptr) * 8);
#if !defined(HAVE_AVX2)
    m128 *sub;
    if (n < 128) {
        sub = &ptr->lo.lo;
    } else if (n < 256) {
        sub = &ptr->lo.hi;
    } else if (n < 384) {
        sub = &ptr->hi.lo;
    } else {
        sub = &ptr->hi.hi;
    }
    clearbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    *ptr = andnot512(mask1bit512(n), *ptr);
#else
    m256 *sub;
    if (n < 256) {
        sub = &ptr->lo;
    } else {
        sub = &ptr->hi;
        n -= 256;
    }
    clearbit256(sub, n);
#endif
}

// tests bit N in the given vector.
static really_inline
char testbit512(m512 val, unsigned int n) {
    assert(n < sizeof(val) * 8);
#if !defined(HAVE_AVX2)
    m128 sub;
    if (n < 128) {
        sub = val.lo.lo;
    } else if (n < 256) {
        sub = val.lo.hi;
    } else if (n < 384) {
        sub = val.hi.lo;
    } else {
        sub = val.hi.hi;
    }
    return testbit128(sub, n % 128);
#elif defined(HAVE_AVX512)
    const m512 mask = mask1bit512(n);
    return !!_mm512_test_epi8_mask(mask, val);
#else
    m256 sub;
    if (n < 256) {
        sub = val.lo;
    } else {
        sub = val.hi;
        n -= 256;
    }
    return testbit256(sub, n);
#endif
}
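
/*
 * Illustrative sketch (hypothetical check): clearbit512() undoes setbit512(),
 * leaving a previously zeroed vector with no bits set.
 */
static really_inline
char clearbit512_undoes_setbit512(unsigned int n) {
    assert(n < sizeof(m512) * 8);
    m512 v = zeroes512();
    setbit512(&v, n);
    clearbit512(&v, n);
    return !testbit512(v, n) && !isnonzero512(v); /* always 1 */
}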

#endif