/* Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020-2021 Christopher Moore <moore@free.fr>
 *   2020      Evan Nemerson <evan@nemerson.com>
 */

#if !defined(SIMDE_X86_GFNI_H)
#define SIMDE_X86_GFNI_H

#include "avx512/add.h"
#include "avx512/and.h"
#include "avx512/broadcast.h"
#include "avx512/cmpeq.h"
#include "avx512/cmpge.h"
#include "avx512/cmpgt.h"
#include "avx512/cmplt.h"
#include "avx512/extract.h"
#include "avx512/insert.h"
#include "avx512/kshift.h"
#include "avx512/mov.h"
#include "avx512/mov_mask.h"
#include "avx512/permutex2var.h"
#include "avx512/set.h"
#include "avx512/set1.h"
#include "avx512/setzero.h"
#include "avx512/shuffle.h"
#include "avx512/srli.h"
#include "avx512/test.h"
#include "avx512/xor.h"

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

/* In all the *gf2p8affine* intrinsics the argument b must be a compile-time constant, so we must use macros and simde_x_mm* helper functions. */

/* N.B. The _mm*gf2p8affineinv_epi64_epi8 and _mm*gf2p8mul_epi8 intrinsics assume a Field Generator Polynomial (FGP) (a.k.a. reduction polynomial) of 0x11B. */
/* Only the _mm*gf2p8affine_epi64_epi8 intrinsics do not assume this specific FGP. */

/* The field generator polynomial is 0x11B, but we make the 0x100 bit implicit so the constant fits inside 8 bits. */
#define SIMDE_X86_GFNI_FGP 0x1B
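
/* For example, multiplying an element by x (i.e. by 0x02) is a left shift followed,
 * whenever a bit is shifted out of the top, by a reduction with the FGP:
 *
 *   gf2p8mul(0x80, 0x02) = (0x80 << 1) ^ 0x11B = 0x100 ^ 0x11B = 0x1B
 *
 * Since the shifted-out 0x100 bit always cancels against the 0x100 bit of the FGP,
 * only the low byte (0x1B) ever needs to be XORed in, which is why the constant
 * above omits the 0x100 bit. */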

/* Computing the inverse of a GF element is expensive, so use this LUT for an FGP of 0x11B. */
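/* Each entry is the multiplicative inverse of its index in GF(2^8) with FGP 0x11B,
 * with the non-invertible element 0 mapped to 0.  As a quick sanity check,
 * simde_x_gf2p8inverse_lut.u8[0x02] == 0x8d because
 * gf2p8mul(0x02, 0x8d) = 0x11A ^ 0x11B = 0x01. */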

static const union {
  uint8_t      u8[256];
  simde__m128i m128i[16];
} simde_x_gf2p8inverse_lut = {
  {
   0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7,
   0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2,
   0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2,
   0x2c, 0x45, 0x92, 0x6c, 0xf3, 0x39, 0x66, 0x42, 0xf2, 0x35, 0x20, 0x6f, 0x77, 0xbb, 0x59, 0x19,
   0x1d, 0xfe, 0x37, 0x67, 0x2d, 0x31, 0xf5, 0x69, 0xa7, 0x64, 0xab, 0x13, 0x54, 0x25, 0xe9, 0x09,
   0xed, 0x5c, 0x05, 0xca, 0x4c, 0x24, 0x87, 0xbf, 0x18, 0x3e, 0x22, 0xf0, 0x51, 0xec, 0x61, 0x17,
   0x16, 0x5e, 0xaf, 0xd3, 0x49, 0xa6, 0x36, 0x43, 0xf4, 0x47, 0x91, 0xdf, 0x33, 0x93, 0x21, 0x3b,
   0x79, 0xb7, 0x97, 0x85, 0x10, 0xb5, 0xba, 0x3c, 0xb6, 0x70, 0xd0, 0x06, 0xa1, 0xfa, 0x81, 0x82,
   0x83, 0x7e, 0x7f, 0x80, 0x96, 0x73, 0xbe, 0x56, 0x9b, 0x9e, 0x95, 0xd9, 0xf7, 0x02, 0xb9, 0xa4,
   0xde, 0x6a, 0x32, 0x6d, 0xd8, 0x8a, 0x84, 0x72, 0x2a, 0x14, 0x9f, 0x88, 0xf9, 0xdc, 0x89, 0x9a,
   0xfb, 0x7c, 0x2e, 0xc3, 0x8f, 0xb8, 0x65, 0x48, 0x26, 0xc8, 0x12, 0x4a, 0xce, 0xe7, 0xd2, 0x62,
   0x0c, 0xe0, 0x1f, 0xef, 0x11, 0x75, 0x78, 0x71, 0xa5, 0x8e, 0x76, 0x3d, 0xbd, 0xbc, 0x86, 0x57,
   0x0b, 0x28, 0x2f, 0xa3, 0xda, 0xd4, 0xe4, 0x0f, 0xa9, 0x27, 0x53, 0x04, 0x1b, 0xfc, 0xac, 0xe6,
   0x7a, 0x07, 0xae, 0x63, 0xc5, 0xdb, 0xe2, 0xea, 0x94, 0x8b, 0xc4, 0xd5, 0x9d, 0xf8, 0x90, 0x6b,
   0xb1, 0x0d, 0xd6, 0xeb, 0xc6, 0x0e, 0xcf, 0xad, 0x08, 0x4e, 0xd7, 0xe3, 0x5d, 0x50, 0x1e, 0xb3,
   0x5b, 0x23, 0x38, 0x34, 0x68, 0x46, 0x03, 0x8c, 0xdd, 0x9c, 0x7d, 0xa0, 0xcd, 0x1a, 0x41, 0x1c
  }
};
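
/* The simde_x_*_gf2p8matrix_multiply_epi64_epi8 helpers below compute the core of the
 * affine intrinsics: each 64-bit lane of A holds an 8x8 bit-matrix (one row per byte),
 * and each result bit of an output byte is the parity (XOR of all bits) of the AND of
 * the corresponding input byte of x with one row of that matrix, i.e. a matrix-vector
 * product over GF(2).  The gf2p8affine intrinsics then XOR in the broadcast constant b. */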

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_gf2p8matrix_multiply_epi64_epi8 (simde__m128i x, simde__m128i A) {
  #if defined(SIMDE_X86_SSSE3_NATIVE)
    simde__m128i r, a, p;
    const simde__m128i byte_select = simde_x_mm_set_epu64x(UINT64_C(0xFDFDFDFDFDFDFDFD), UINT64_C(0xFEFEFEFEFEFEFEFE));
    const simde__m128i zero = simde_mm_setzero_si128();

    /* Reverse the bytes within each 64-bit lane (cf. the bswap in the portable
     * fallback) so the matrix rows line up with the result bits produced below. */
    a = simde_mm_shuffle_epi8(A, simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607)));
    r = zero;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 8 ; i++) {
      /* Where the current bit of x is set (tested via the sign bit), XOR in the
       * corresponding matrix column gathered by movemask. */
      p = simde_mm_insert_epi16(zero, simde_mm_movemask_epi8(a), 1);
      p = simde_mm_shuffle_epi8(p, simde_mm_sign_epi8(byte_select, x));
      r = simde_mm_xor_si128(r, p);
      a = simde_mm_add_epi8(a, a);
      x = simde_mm_add_epi8(x, x);
    }

    return r;
  #else
    simde__m128i_private
      r_,
      x_ = simde__m128i_to_private(x),
      A_ = simde__m128i_to_private(A);
    const uint64_t ones = UINT64_C(0x0101010101010101);
    const uint64_t mask = UINT64_C(0x0102040810204080);
    uint64_t q;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      /* AND the broadcast byte of x with all eight rows of its matrix, reduce each
       * row to its parity bit, then gather the eight parity bits into one byte. */
      q = simde_endian_bswap64_le(A_.u64[i / 8]);
      q &= HEDLEY_STATIC_CAST(uint64_t, x_.u8[i]) * ones;
      q ^= q >> 4;
      q ^= q >> 2;
      q ^= q >> 1;
      q &= ones;
      q *= 255;
      q &= mask;
      q |= q >> 32;
      q |= q >> 16;
      q |= q >> 8;
      r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, q);
    }

    return simde__m128i_from_private(r_);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_gf2p8matrix_multiply_epi64_epi8 (simde__m256i x, simde__m256i A) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i r, a, p;
    const simde__m256i byte_select = simde_x_mm256_set_epu64x(UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202),
                                                              UINT64_C(0x0101010101010101), UINT64_C(0x0000000000000000));
    a = simde_mm256_shuffle_epi8(A, simde_mm256_broadcastsi128_si256(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607))));
    r = simde_mm256_setzero_si256();

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 8 ; i++) {
      p = simde_mm256_set1_epi32(simde_mm256_movemask_epi8(a));
      p = simde_mm256_shuffle_epi8(p, byte_select);
      p = simde_mm256_xor_si256(r, p);
      r = simde_mm256_blendv_epi8(r, p, x);
      a = simde_mm256_add_epi8(a, a);
      x = simde_mm256_add_epi8(x, x);
    }

    return r;
  #else
    simde__m256i_private
      r_,
      x_ = simde__m256i_to_private(x),
      A_ = simde__m256i_to_private(A);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
      r_.m128i[i] = simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x_.m128i[i], A_.m128i[i]);
    }

    return simde__m256i_from_private(r_);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_x_mm512_gf2p8matrix_multiply_epi64_epi8 (simde__m512i x, simde__m512i A) {
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__m512i r, a, p;
    const simde__m512i byte_select = simde_x_mm512_set_epu64(UINT64_C(0x0707070707070707), UINT64_C(0x0606060606060606), UINT64_C(0x0505050505050505), UINT64_C(0x0404040404040404),
                                                             UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202), UINT64_C(0x0101010101010101), UINT64_C(0x0000000000000000));
    a = simde_mm512_shuffle_epi8(A, simde_mm512_broadcast_i32x4(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607))));
    r = simde_mm512_setzero_si512();

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 8 ; i++) {
      p = simde_mm512_set1_epi64(HEDLEY_STATIC_CAST(int64_t, simde_mm512_movepi8_mask(a)));
      p = simde_mm512_maskz_shuffle_epi8(simde_mm512_movepi8_mask(x), p, byte_select);
      r = simde_mm512_xor_si512(r, p);
      a = simde_mm512_add_epi8(a, a);
      x = simde_mm512_add_epi8(x, x);
    }

    return r;
  #else
    simde__m512i_private
      r_,
      x_ = simde__m512i_to_private(x),
      A_ = simde__m512i_to_private(A);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
      r_.m256i[i] = simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x_.m256i[i], A_.m256i[i]);
    }

    return simde__m512i_from_private(r_);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_x_mm_gf2p8inverse_epi8 (simde__m128i x) {
  #if defined(SIMDE_X86_SSE4_1_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m128i r, u, t, test;
    const simde__m128i sixteens = simde_mm_set1_epi8(16);
    const simde__m128i masked_x = simde_mm_and_si128(x, simde_mm_set1_epi8(0x0F));

    test = simde_mm_set1_epi8(INT8_MIN /* 0x80 */);
    x = simde_mm_xor_si128(x, test);
    r = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[0], masked_x);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 1 ; i < 16 ; i++) {
      t = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[i], masked_x);
      test = simde_mm_add_epi8(test, sixteens);
      u = simde_mm_cmplt_epi8(x, test);
      r = simde_mm_blendv_epi8(t, r, u);
    }

    return r;
  #else
    simde__m128i_private
      r_,
      x_ = simde__m128i_to_private(x);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      r_.u8[i] = simde_x_gf2p8inverse_lut.u8[x_.u8[i]];
    }

    return simde__m128i_from_private(r_);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_x_mm256_gf2p8inverse_epi8 (simde__m256i x) {
  #if defined(SIMDE_X86_AVX2_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m256i r, u, t, test;
    const simde__m256i sixteens = simde_mm256_set1_epi8(16);
    const simde__m256i masked_x = simde_mm256_and_si256(x, simde_mm256_set1_epi8(0x0F));

    test = simde_mm256_set1_epi8(INT8_MIN /* 0x80 */);
    x = simde_mm256_xor_si256(x, test);
    r = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[0]), masked_x);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 1 ; i < 16 ; i++) {
      t = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[i]), masked_x);
      test = simde_mm256_add_epi8(test, sixteens);
      u = simde_mm256_cmpgt_epi8(test, x);
      r = simde_mm256_blendv_epi8(t, r, u);
    }

    return r;
  #else
    simde__m256i_private
      r_,
      x_ = simde__m256i_to_private(x);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
      r_.m128i[i] = simde_x_mm_gf2p8inverse_epi8(x_.m128i[i]);
    }

    return simde__m256i_from_private(r_);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_x_mm512_gf2p8inverse_epi8 (simde__m512i x) {
  /* N.B. CM: TODO: later add a VBMI version using just two _mm512_permutex2var_epi8 and friends. */
  /* But, except for Cannon Lake, all processors with VBMI also have GFNI. */
  #if defined(SIMDE_X86_AVX512BW_NATIVE)
    /* N.B. CM: this fallback may not be faster */
    simde__m512i r, test;
    const simde__m512i sixteens = simde_mm512_set1_epi8(16);
    const simde__m512i masked_x = simde_mm512_and_si512(x, simde_mm512_set1_epi8(0x0F));

    r = simde_mm512_shuffle_epi8(simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[0]), masked_x);
    test = sixteens;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 1 ; i < 16 ; i++) {
      r = simde_mm512_mask_shuffle_epi8(r, simde_mm512_cmpge_epu8_mask(x, test), simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[i]), masked_x);
      test = simde_mm512_add_epi8(test, sixteens);
    }

    return r;
  #else
    simde__m512i_private
      r_,
      x_ = simde__m512i_to_private(x);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) {
      r_.m256i[i] = simde_x_mm256_gf2p8inverse_epi8(x_.m256i[i]);
    }

    return simde__m512i_from_private(r_);
  #endif
}

#define simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm_gf2p8matrix_multiply_epi64_epi8(simde_x_mm_gf2p8inverse_epi8(x), A)
#define simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(simde_x_mm256_gf2p8inverse_epi8(x), A)
#define simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(simde_x_mm512_gf2p8inverse_epi8(x), A)
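
/* The affine intrinsics below compute A.x XOR b (and A.inverse(x) XOR b for the *inv*
 * variants), where the product is the GF(2) matrix-vector product described above and
 * the 8-bit immediate b is XORed into every result byte.  For instance, the AES S-box
 * can be expressed as a gf2p8affineinv with the AES affine bit-matrix and b = 0x63. */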

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8affine_epi64_epi8 (simde__m128i x, simde__m128i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE)
  #define simde_mm_gf2p8affine_epi64_epi8(x, A, b) _mm_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_gf2p8affine_epi64_epi8
  #define _mm_gf2p8affine_epi64_epi8(x, A, b) simde_mm_gf2p8affine_epi64_epi8(x, A, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_gf2p8affine_epi64_epi8 (simde__m256i x, simde__m256i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE)
  #define simde_mm256_gf2p8affine_epi64_epi8(x, A, b) _mm256_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_gf2p8affine_epi64_epi8
  #define _mm256_gf2p8affine_epi64_epi8(x, A, b) simde_mm256_gf2p8affine_epi64_epi8(x, A, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_gf2p8affine_epi64_epi8 (simde__m512i x, simde__m512i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_gf2p8affine_epi64_epi8(x, A, b) _mm512_gf2p8affine_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_gf2p8affine_epi64_epi8
  #define _mm512_gf2p8affine_epi64_epi8(x, A, b) simde_mm512_gf2p8affine_epi64_epi8(x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_gf2p8affine_epi64_epi8
  #define _mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_gf2p8affine_epi64_epi8
  #define _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_gf2p8affine_epi64_epi8
  #define _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_gf2p8affine_epi64_epi8
  #define _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_gf2p8affine_epi64_epi8
  #define _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#else
  #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_gf2p8affine_epi64_epi8
  #define _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8affineinv_epi64_epi8 (simde__m128i x, simde__m128i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE)
  #define simde_mm_gf2p8affineinv_epi64_epi8(x, A, b) _mm_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_gf2p8affineinv_epi64_epi8
  #define _mm_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm_gf2p8affineinv_epi64_epi8(x, A, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_gf2p8affineinv_epi64_epi8 (simde__m256i x, simde__m256i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE)
  #define simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b) _mm256_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_gf2p8affineinv_epi64_epi8
  #define _mm256_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_gf2p8affineinv_epi64_epi8 (simde__m512i x, simde__m512i A, int b)
    SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) {
  return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b)));
}
#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b) _mm512_gf2p8affineinv_epi64_epi8(x, A, b)
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_gf2p8affineinv_epi64_epi8
  #define _mm512_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_gf2p8affineinv_epi64_epi8
  #define _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_gf2p8affineinv_epi64_epi8
  #define _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#else
  #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_gf2p8affineinv_epi64_epi8
  #define _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_gf2p8affineinv_epi64_epi8
  #define _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
  #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_gf2p8affineinv_epi64_epi8
  #define _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif

#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
  #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#else
  #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b))
#endif
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_gf2p8affineinv_epi64_epi8
  #define _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b)
#endif
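
/* simde_mm*_gf2p8mul_epi8 multiply each pair of bytes as polynomials over GF(2) and
 * reduce the 15-bit product modulo the FGP 0x11B (the AES field).  The fallbacks below
 * use the classic shift-and-add formulation: for each set bit of b, XOR the current
 * value of a into the result, then double a (shift left and, when a bit overflows,
 * XOR in the FGP).  Worked example: gf2p8mul(0x57, 0x83) == 0xc1. */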

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_gf2p8mul_epi8 (simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && (defined(SIMDE_X86_AVX512VL_NATIVE) || !defined(SIMDE_X86_AVX512F_NATIVE))
    return _mm_gf2p8mul_epi8(a, b);
  #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE)
    const poly8x16_t pa = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(a));
    const poly8x16_t pb = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(b));
    const uint8x16_t lo = vreinterpretq_u8_p16(vmull_p8(vget_low_p8(pa), vget_low_p8(pb)));
    #if defined (SIMDE_ARM_NEON_A64V8_NATIVE)
      uint8x16_t hi = vreinterpretq_u8_p16(vmull_high_p8(pa, pb));
    #else
      uint8x16_t hi = vreinterpretq_u8_p16(vmull_p8(vget_high_p8(pa), vget_high_p8(pb)));
    #endif
    uint8x16x2_t hilo = vuzpq_u8(lo, hi);
    uint8x16_t r = hilo.val[0];
    hi = hilo.val[1];
    const uint8x16_t idxHi = vshrq_n_u8(hi, 4);
    const uint8x16_t idxLo = vandq_u8(hi, vdupq_n_u8(0xF));
    #if defined (SIMDE_ARM_NEON_A64V8_NATIVE)
      const uint8x16_t reduceLutHi = {0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c, 0x2f, 0x84, 0x62, 0xc9, 0xb5, 0x1e, 0xf8, 0x53};
      const uint8x16_t reduceLutLo = {0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99};
      r = veorq_u8(r, vqtbl1q_u8(reduceLutHi, idxHi));
      r = veorq_u8(r, vqtbl1q_u8(reduceLutLo, idxLo));
    #else
      const uint8x8x2_t reduceLutHi = {{{0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c}, {0x2f, 0x84, 0x62, 0xc9, 0xb5, 0x1e, 0xf8, 0x53}}};
      const uint8x8x2_t reduceLutLo = {{{0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41}, {0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}}};
      r = veorq_u8(r, vcombine_u8(vtbl2_u8(reduceLutHi, vget_low_u8(idxHi)), vtbl2_u8(reduceLutHi, vget_high_u8(idxHi))));
      r = veorq_u8(r, vcombine_u8(vtbl2_u8(reduceLutLo, vget_low_u8(idxLo)), vtbl2_u8(reduceLutLo, vget_high_u8(idxLo))));
    #endif
    return simde__m128i_from_neon_u8(r);
  #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) x, y, lo, hi;
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) even, odd, mask0x00FF;
    x = simde__m128i_to_altivec_u8(a);
    y = simde__m128i_to_altivec_u8(b);
    mask0x00FF = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x00FF));
    lo = vec_and(y, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), mask0x00FF));
    hi = vec_xor(y, lo);
    even = vec_gfmsum(x, lo);
    odd  = vec_gfmsum(x, hi);
    lo = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_sel(vec_rli(odd, 8), even, mask0x00FF));
    hi = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_sel(odd, vec_rli(even, 8), mask0x00FF));
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) reduceLutHi = {0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c, 0x2f, 0x84, 0x62, 0xc9, 0xb5, 0x1e, 0xf8, 0x53};
    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) reduceLutLo = {0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99};
    lo = vec_xor(lo, vec_perm(reduceLutHi, reduceLutHi, vec_rli(hi, 4)));
    lo = vec_xor(lo, vec_perm(reduceLutLo, reduceLutLo, hi));
    return simde__m128i_from_altivec_u8(lo);
  #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE)
    SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) x, y, r, t, m;
    x = simde__m128i_to_altivec_u8(a);
    y = simde__m128i_to_altivec_u8(b);

    const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = vec_splat_s8(0);

    m = vec_splat_u8(0x01);

    const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) fgp = vec_splats(HEDLEY_STATIC_CAST(unsigned char, SIMDE_X86_GFNI_FGP));
    t = vec_and(y, m);
    t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m));
    r = vec_and(x, t);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmplt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), x), zero));
      x = vec_add(x, x);
      t = vec_and(fgp, t);
      x = vec_xor(x, t);
      m = vec_add(m, m);
      t = vec_and(y, m);
      t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m));
      t = vec_and(x, t);
      r = vec_xor(r, t);
    }

    return simde__m128i_from_altivec_u8(r);
  #elif defined(SIMDE_WASM_SIMD128_NATIVE)
    v128_t x, y, r, t, m;
    x = simde__m128i_to_wasm_v128(a);
    y = simde__m128i_to_wasm_v128(b);

    m = wasm_i8x16_splat(0x01);

    const v128_t fgp = wasm_i8x16_splat(SIMDE_X86_GFNI_FGP);

    t = wasm_v128_and(y, m);
    t = wasm_i8x16_eq(t, m);
    r = wasm_v128_and(x, t);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      t = wasm_i8x16_shr(x, 7);
      x = wasm_i8x16_add(x, x);
      t = wasm_v128_and(fgp, t);
      x = wasm_v128_xor(x, t);
      m = wasm_i8x16_add(m, m);
      t = wasm_v128_and(y, m);
      t = wasm_i8x16_eq(t, m);
      t = wasm_v128_and(x, t);
      r = wasm_v128_xor(r, t);
    }

    return simde__m128i_from_wasm_v128(r);
  #elif defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__m512i r4, t4, u4;
    simde__mmask64 ma, mb;

    simde__m512i a4 = simde_mm512_broadcast_i32x4(a);
    const simde__m512i zero = simde_mm512_setzero_si512();
    simde__mmask16 m8 = simde_mm512_cmpeq_epi32_mask(zero, zero);

    const simde__m512i b4 = simde_mm512_broadcast_i32x4(b);

    simde__m512i bits = simde_mm512_set_epi64(0x4040404040404040,
                                              0x4040404040404040,
                                              0x1010101010101010,
                                              0x1010101010101010,
                                              0x0404040404040404,
                                              0x0404040404040404,
                                              0x0101010101010101,
                                              0x0101010101010101);

    const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP);

    for (int i = 0 ; i < 3 ; i++) {
      m8 = simde_kshiftli_mask16(m8, 4);

      ma = simde_mm512_cmplt_epi8_mask(a4, zero);
      u4 = simde_mm512_add_epi8(a4, a4);
      t4 = simde_mm512_maskz_mov_epi8(ma, fgp);
      u4 = simde_mm512_xor_epi32(u4, t4);

      ma = simde_mm512_cmplt_epi8_mask(u4, zero);
      u4 = simde_mm512_add_epi8(u4, u4);
      t4 = simde_mm512_maskz_mov_epi8(ma, fgp);
      a4 = simde_mm512_mask_xor_epi32(a4, m8, u4, t4);
    }

    mb = simde_mm512_test_epi8_mask(b4, bits);
    bits = simde_mm512_add_epi8(bits, bits);
    ma = simde_mm512_cmplt_epi8_mask(a4, zero);
    r4 = simde_mm512_maskz_mov_epi8(mb, a4);
    mb = simde_mm512_test_epi8_mask(b4, bits);
    a4 = simde_mm512_add_epi8(a4, a4);
    t4 = simde_mm512_maskz_mov_epi8(ma, fgp);
    a4 = simde_mm512_xor_si512(a4, t4);
    t4 = simde_mm512_maskz_mov_epi8(mb, a4);
    r4 = simde_mm512_xor_si512(r4, t4);

    r4 = simde_mm512_xor_si512(r4, simde_mm512_shuffle_i32x4(r4, r4, (1 << 6) + (0 << 4) + (3 << 2) + 2));
    r4 = simde_mm512_xor_si512(r4, simde_mm512_shuffle_i32x4(r4, r4, (0 << 6) + (3 << 4) + (2 << 2) + 1));

    return simde_mm512_extracti32x4_epi32(r4, 0);
  #elif defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i r2, t2;
    simde__m256i a2 = simde_mm256_broadcastsi128_si256(a);
    const simde__m256i zero = simde_mm256_setzero_si256();
    const simde__m256i fgp = simde_mm256_set1_epi8(SIMDE_X86_GFNI_FGP);
    const simde__m256i ones = simde_mm256_set1_epi8(0x01);
    simde__m256i b2 = simde_mm256_set_m128i(simde_mm_srli_epi64(b, 4), b);

    for (int i = 0 ; i < 4 ; i++) {
      t2 = simde_mm256_cmpgt_epi8(zero, a2);
      t2 = simde_mm256_and_si256(fgp, t2);
      a2 = simde_mm256_add_epi8(a2, a2);
      a2 = simde_mm256_xor_si256(a2, t2);
    }

    a2 = simde_mm256_inserti128_si256(a2, a, 0);

    t2 = simde_mm256_and_si256(b2, ones);
    t2 = simde_mm256_cmpeq_epi8(t2, ones);
    r2 = simde_mm256_and_si256(a2, t2);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 3 ; i++) {
      t2 = simde_mm256_cmpgt_epi8(zero, a2);
      t2 = simde_mm256_and_si256(fgp, t2);
      a2 = simde_mm256_add_epi8(a2, a2);
      a2 = simde_mm256_xor_si256(a2, t2);
      b2 = simde_mm256_srli_epi64(b2, 1);
      t2 = simde_mm256_and_si256(b2, ones);
      t2 = simde_mm256_cmpeq_epi8(t2, ones);
      t2 = simde_mm256_and_si256(a2, t2);
      r2 = simde_mm256_xor_si256(r2, t2);
    }

    return simde_mm_xor_si128(simde_mm256_extracti128_si256(r2, 1),
                              simde_mm256_extracti128_si256(r2, 0));
  #elif defined(SIMDE_X86_SSE2_NATIVE)
    simde__m128i r, t;
    const simde__m128i zero = simde_mm_setzero_si128();
    const simde__m128i ones = simde_mm_set1_epi8(0x01);

    const simde__m128i fgp = simde_mm_set1_epi8(SIMDE_X86_GFNI_FGP);

    t = simde_mm_and_si128(b, ones);
    t = simde_mm_cmpeq_epi8(t, ones);
    r = simde_mm_and_si128(a, t);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      t = simde_mm_cmpgt_epi8(zero, a);
      t = simde_mm_and_si128(fgp, t);
      a = simde_mm_add_epi8(a, a);
      a = simde_mm_xor_si128(a, t);
      b = simde_mm_srli_epi64(b, 1);
      t = simde_mm_and_si128(b, ones);
      t = simde_mm_cmpeq_epi8(t, ones);
      t = simde_mm_and_si128(a, t);
      r = simde_mm_xor_si128(r, t);
    }

    return r;
  #else
    simde__m128i_private
      r_,
      a_ = simde__m128i_to_private(a),
      b_ = simde__m128i_to_private(b);

    const uint8_t fgp = SIMDE_X86_GFNI_FGP;

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) {
      r_.u8[i] = 0;
      while ((a_.u8[i] != 0) && (b_.u8[i] != 0)) {
        if (b_.u8[i] & 1)
          r_.u8[i] ^= a_.u8[i];

        if (a_.u8[i] & 0x80)
          a_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.u8[i] << 1) ^ fgp);
        else
          a_.u8[i] <<= 1;

        b_.u8[i] >>= 1;
      }
    }

    return simde__m128i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_gf2p8mul_epi8
  #define _mm_gf2p8mul_epi8(a, b) simde_mm_gf2p8mul_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_gf2p8mul_epi8 (simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && (defined(SIMDE_X86_AVX512VL_NATIVE) || (defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE)))
    return _mm256_gf2p8mul_epi8(a, b);
  #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__mmask64 ma, mb;
    simde__m512i r, t, s;
    simde__m512i a2 = simde_mm512_broadcast_i64x4(a);
    const simde__m512i zero = simde_mm512_setzero_si512();

    const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP);

    s = simde_mm512_set1_epi8(0x01);

    for (int i = 0 ; i < 4 ; i++) {
      ma = simde_mm512_cmplt_epi8_mask(a2, zero);
      a2 = simde_mm512_add_epi8(a2, a2);
      t = simde_mm512_xor_si512(a2, fgp);
      a2 = simde_mm512_mask_mov_epi8(a2, ma, t);
    }

    simde__m512i b2 = simde_mm512_inserti64x4(zero, simde_mm256_srli_epi64(b, 4), 1);
    b2 = simde_mm512_inserti64x4(b2, b, 0);
    a2 = simde_mm512_inserti64x4(a2, a, 0);

    mb = simde_mm512_test_epi8_mask(b2, s);
    r = simde_mm512_maskz_mov_epi8(mb, a2);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 3 ; i++) {
      ma = simde_mm512_cmplt_epi8_mask(a2, zero);
      s = simde_mm512_add_epi8(s, s);
      mb = simde_mm512_test_epi8_mask(b2, s);
      a2 = simde_mm512_add_epi8(a2, a2);
      t = simde_mm512_maskz_mov_epi8(ma, fgp);
      a2 = simde_mm512_xor_si512(a2, t);
      t = simde_mm512_maskz_mov_epi8(mb, a2);
      r = simde_mm512_xor_si512(r, t);
    }

    return simde_mm256_xor_si256(simde_mm512_extracti64x4_epi64(r, 1),
                                 simde_mm512_extracti64x4_epi64(r, 0));
  #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX2_NATIVE)
    simde__m256i r, t;
    const simde__m256i zero = simde_mm256_setzero_si256();
    const simde__m256i ones = simde_mm256_set1_epi8(0x01);

    const simde__m256i fgp = simde_mm256_set1_epi8(SIMDE_X86_GFNI_FGP);

    t = simde_mm256_and_si256(b, ones);
    t = simde_mm256_cmpeq_epi8(t, ones);
    r = simde_mm256_and_si256(a, t);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      t = simde_mm256_cmpgt_epi8(zero, a);
      t = simde_mm256_and_si256(fgp, t);
      a = simde_mm256_add_epi8(a, a);
      a = simde_mm256_xor_si256(a, t);
      b = simde_mm256_srli_epi64(b, 1);
      t = simde_mm256_and_si256(b, ones);
      t = simde_mm256_cmpeq_epi8(t, ones);
      t = simde_mm256_and_si256(a, t);
      r = simde_mm256_xor_si256(r, t);
    }

    return r;
  #else
    simde__m256i_private
      r_,
      a_ = simde__m256i_to_private(a),
      b_ = simde__m256i_to_private(b);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
      r_.m128i[i] = simde_mm_gf2p8mul_epi8(a_.m128i[i], b_.m128i[i]);
    }

    return simde__m256i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES)
  #undef _mm256_gf2p8mul_epi8
  #define _mm256_gf2p8mul_epi8(a, b) simde_mm256_gf2p8mul_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_gf2p8mul_epi8 (simde__m512i a, simde__m512i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_gf2p8mul_epi8(a, b);
  #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE)
    simde__m512i r, s, t;
    simde__mmask64 ma, mb;
    const simde__m512i zero = simde_mm512_setzero_si512();

    const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP);

    s = simde_mm512_set1_epi8(0x01);

    mb = simde_mm512_test_epi8_mask(b, s);
    r = simde_mm512_maskz_mov_epi8(mb, a);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (int i = 0 ; i < 7 ; i++) {
      ma = simde_mm512_cmplt_epi8_mask(a, zero);
      s = simde_mm512_add_epi8(s, s);
      mb = simde_mm512_test_epi8_mask(b, s);
      a = simde_mm512_add_epi8(a, a);
      t = simde_mm512_maskz_mov_epi8(ma, fgp);
      a = simde_mm512_xor_si512(a, t);
      t = simde_mm512_maskz_mov_epi8(mb, a);
      r = simde_mm512_xor_si512(r, t);
    }

    return r;
  #else
    simde__m512i_private
      r_,
      a_ = simde__m512i_to_private(a),
      b_ = simde__m512i_to_private(b);

    #if !defined(__INTEL_COMPILER)
      SIMDE_VECTORIZE
    #endif
    for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) {
      r_.m128i[i] = simde_mm_gf2p8mul_epi8(a_.m128i[i], b_.m128i[i]);
    }

    return simde__m512i_from_private(r_);
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_gf2p8mul_epi8
  #define _mm512_gf2p8mul_epi8(a, b) simde_mm512_gf2p8mul_epi8(a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_mask_gf2p8mul_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_mask_gf2p8mul_epi8(src, k, a, b);
  #else
    return simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_mask_gf2p8mul_epi8
  #define _mm_mask_gf2p8mul_epi8(src, k, a, b) simde_mm_mask_gf2p8mul_epi8(src, k, a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_mask_gf2p8mul_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_mask_gf2p8mul_epi8(src, k, a, b);
  #else
    return simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_mask_gf2p8mul_epi8
  #define _mm256_mask_gf2p8mul_epi8(src, k, a, b) simde_mm256_mask_gf2p8mul_epi8(src, k, a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_mask_gf2p8mul_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_mask_gf2p8mul_epi8(src, k, a, b);
  #else
    return simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_mask_gf2p8mul_epi8
  #define _mm512_mask_gf2p8mul_epi8(src, k, a, b) simde_mm512_mask_gf2p8mul_epi8(src, k, a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_maskz_gf2p8mul_epi8 (simde__mmask16 k, simde__m128i a, simde__m128i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm_maskz_gf2p8mul_epi8(k, a, b);
  #else
    return simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm_maskz_gf2p8mul_epi8
  #define _mm_maskz_gf2p8mul_epi8(k, a, b) simde_mm_maskz_gf2p8mul_epi8(k, a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_maskz_gf2p8mul_epi8 (simde__mmask32 k, simde__m256i a, simde__m256i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE)
    return _mm256_maskz_gf2p8mul_epi8(k, a, b);
  #else
    return simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES)
  #undef _mm256_maskz_gf2p8mul_epi8
  #define _mm256_maskz_gf2p8mul_epi8(k, a, b) simde_mm256_maskz_gf2p8mul_epi8(k, a, b)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_maskz_gf2p8mul_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) {
  #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE)
    return _mm512_maskz_gf2p8mul_epi8(k, a, b);
  #else
    return simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8mul_epi8(a, b));
  #endif
}
#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES)
  #undef _mm512_maskz_gf2p8mul_epi8
  #define _mm512_maskz_gf2p8mul_epi8(k, a, b) simde_mm512_maskz_gf2p8mul_epi8(k, a, b)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_GFNI_H) */