//! Galois Field New Instructions (GFNI)
//!
//! The intrinsics here correspond to those in the `immintrin.h` C header.
//!
//! The reference is [Intel 64 and IA-32 Architectures Software Developer's
//! Manual Volume 2: Instruction Set Reference, A-Z][intel64_ref].
//!
//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf

use crate::core_arch::simd::i8x16;
use crate::core_arch::simd::i8x32;
use crate::core_arch::simd::i8x64;
use crate::core_arch::simd_llvm::simd_select_bitmask;
use crate::core_arch::x86::__m128i;
use crate::core_arch::x86::__m256i;
use crate::core_arch::x86::__m512i;
use crate::core_arch::x86::__mmask16;
use crate::core_arch::x86::__mmask32;
use crate::core_arch::x86::__mmask64;
use crate::core_arch::x86::_mm256_setzero_si256;
use crate::core_arch::x86::_mm512_setzero_si512;
use crate::core_arch::x86::_mm_setzero_si128;
use crate::core_arch::x86::m128iExt;
use crate::core_arch::x86::m256iExt;
use crate::core_arch::x86::m512iExt;
use crate::mem::transmute;

#[cfg(test)]
use stdarch_test::assert_instr;
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.vgf2p8affineinvqb.512"]
    fn vgf2p8affineinvqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.256"]
    fn vgf2p8affineinvqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineinvqb.128"]
    fn vgf2p8affineinvqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8affineqb.512"]
    fn vgf2p8affineqb_512(x: i8x64, a: i8x64, imm8: u8) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8affineqb.256"]
    fn vgf2p8affineqb_256(x: i8x32, a: i8x32, imm8: u8) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8affineqb.128"]
    fn vgf2p8affineqb_128(x: i8x16, a: i8x16, imm8: u8) -> i8x16;
    #[link_name = "llvm.x86.vgf2p8mulb.512"]
    fn vgf2p8mulb_512(a: i8x64, b: i8x64) -> i8x64;
    #[link_name = "llvm.x86.vgf2p8mulb.256"]
    fn vgf2p8mulb_256(a: i8x32, b: i8x32) -> i8x32;
    #[link_name = "llvm.x86.vgf2p8mulb.128"]
    fn vgf2p8mulb_128(a: i8x16, b: i8x16) -> i8x16;
}

// LLVM requires AVX512BW for a lot of these instructions, see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/include/clang/Basic/BuiltinsX86.def#L457
// however our tests also require the target feature list to match Intel's,
// which *doesn't* require AVX512BW but only AVX512F, so we add the redundant AVX512F
// requirement (for now).
// Also see
// https://github.com/llvm/llvm-project/blob/release/9.x/clang/lib/Headers/gfniintrin.h
// which forces the GFNI, BW and optionally VL extensions.

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8)
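///
/// A minimal usage sketch (not compiled as a doctest, since it needs the
/// `avx512gfni` and `avx512bw` target features at runtime); the values are
/// the worked example from FIPS-197, where 0x57 * 0x83 = 0xc1 in this field:
///
/// ```ignore
/// let a = _mm512_set1_epi8(0x57);
/// let b = _mm512_set1_epi8(0x83u8 as i8);
/// // Every byte of `product` is 0xc1.
/// let product = _mm512_gf2p8mul_epi8(a, b);
/// ```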
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_gf2p8mul_epi8(a: __m512i, b: __m512i) -> __m512i {
    transmute(vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8mul_epi8)
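///
/// A minimal sketch of the merge-masking behaviour (assumed inputs, not a
/// doctest):
///
/// ```ignore
/// // Only the two lowest mask bits are set, so bytes 0 and 1 of `r` hold
/// // the GF(2^8) products while all remaining bytes are copied from `src`.
/// let r = _mm512_mask_gf2p8mul_epi8(src, 0b11, a, b);
/// ```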
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_mask_gf2p8mul_epi8(
    src: __m512i,
    k: __mmask64,
    a: __m512i,
    b: __m512i,
) -> __m512i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        src.as_i8x64(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8mul_epi8)
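///
/// A minimal sketch of the zero-masking behaviour (assumed inputs, not a
/// doctest):
///
/// ```ignore
/// // Only the two lowest mask bits are set, so bytes 0 and 1 of `r` hold
/// // the GF(2^8) products while all remaining bytes are zero.
/// let r = _mm512_maskz_gf2p8mul_epi8(0b11, a, b);
/// ```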
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm512_maskz_gf2p8mul_epi8(k: __mmask64, a: __m512i, b: __m512i) -> __m512i {
    let zero = _mm512_setzero_si512().as_i8x64();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_512(a.as_i8x64(), b.as_i8x64()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_gf2p8mul_epi8(a: __m256i, b: __m256i) -> __m256i {
    transmute(vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_mask_gf2p8mul_epi8(
    src: __m256i,
    k: __mmask32,
    a: __m256i,
    b: __m256i,
) -> __m256i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        src.as_i8x32(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm256_maskz_gf2p8mul_epi8(k: __mmask32, a: __m256i, b: __m256i) -> __m256i {
    let zero = _mm256_setzero_si256().as_i8x32();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_256(a.as_i8x32(), b.as_i8x32()),
        zero,
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_gf2p8mul_epi8(a: __m128i, b: __m128i) -> __m128i {
    transmute(vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_mask_gf2p8mul_epi8(
    src: __m128i,
    k: __mmask16,
    a: __m128i,
    b: __m128i,
) -> __m128i {
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        src.as_i8x16(),
    ))
}

/// Performs a multiplication in GF(2^8) on the packed bytes.
/// The field is in polynomial representation with the reduction polynomial
/// x^8 + x^4 + x^3 + x + 1.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8mul_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8mulb))]
pub unsafe fn _mm_maskz_gf2p8mul_epi8(k: __mmask16, a: __m128i, b: __m128i) -> __m128i {
    let zero = _mm_setzero_si128().as_i8x16();
    transmute(simd_select_bitmask(
        k,
        vgf2p8mulb_128(a.as_i8x16(), b.as_i8x16()),
        zero,
    ))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affine_epi64_epi8)
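///
/// A minimal sketch (assumed inputs, not a doctest). With the identity
/// matrix 0x01_02_04_08_10_20_40_80 (the one the tests below use) the
/// transformation reduces to XORing each byte with b:
///
/// ```ignore
/// let identity = _mm512_set1_epi64(0x01_02_04_08_10_20_40_80);
/// // Every byte of `r` equals the corresponding byte of `x` XOR 0x63.
/// let r = _mm512_gf2p8affine_epi64_epi8::<0x63>(x, identity);
/// ```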
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affine_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affine_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8)
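///
/// A minimal sketch (assumed input, not a doctest) of a well-known use of
/// this instruction: with the matrix 0x80_40_20_10_08_04_02_01 and b = 0 it
/// reverses the bit order inside every byte:
///
/// ```ignore
/// let bit_reverse = _mm_set1_epi64x(0x80_40_20_10_08_04_02_01u64 as i64);
/// // Each byte of `r` is the corresponding byte of `x` with its bits reversed.
/// let r = _mm_gf2p8affine_epi64_epi8::<0>(x, bit_reverse);
/// ```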
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affine_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affine_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the packed bytes in x.
/// That is, it computes a*x+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affine_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affine_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8affineinv_epi64_epi8)
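///
/// A minimal sketch (assumed inputs, not a doctest). With the identity
/// matrix 0x01_02_04_08_10_20_40_80 and b = 0 this yields the field inverse
/// of each byte; with a suitable matrix and b = 0x63 the same instruction
/// computes the AES S-box:
///
/// ```ignore
/// let identity = _mm512_set1_epi64(0x01_02_04_08_10_20_40_80);
/// // Each byte of `inv` is the multiplicative inverse of the
/// // corresponding byte of `x` (0 maps to 0).
/// let inv = _mm512_gf2p8affineinv_epi64_epi8::<0>(x, identity);
/// ```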
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm512_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m512i, a: __m512i) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm512_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm512_setzero_si512().as_i8x64();
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512f")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm512_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m512i,
    k: __mmask64,
    x: __m512i,
    a: __m512i,
) -> __m512i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x64();
    let a = a.as_i8x64();
    let r = vgf2p8affineinvqb_512(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x64()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm256_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m256i, a: __m256i) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm256_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm256_setzero_si256().as_i8x32();
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm256_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm256_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m256i,
    k: __mmask32,
    x: __m256i,
    a: __m256i,
) -> __m256i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x32();
    let a = a.as_i8x32();
    let r = vgf2p8affineinvqb_256(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x32()))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(2)]
pub unsafe fn _mm_gf2p8affineinv_epi64_epi8<const B: i32>(x: __m128i, a: __m128i) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(r)
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are zeroed in the result if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskz_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(3)]
pub unsafe fn _mm_maskz_gf2p8affineinv_epi64_epi8<const B: i32>(
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let zero = _mm_setzero_si128().as_i8x16();
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, zero))
}

/// Performs an affine transformation on the inverted packed bytes in x.
/// That is, it computes a*inv(x)+b over the Galois Field 2^8 for each packed byte, with a being an 8x8 bit matrix
/// and b being a constant 8-bit immediate value.
/// The inverse of a byte is defined with respect to the reduction polynomial x^8+x^4+x^3+x+1.
/// The inverse of 0 is 0.
/// Each pack of 8 bytes in x is paired with the 64-bit word at the same position in a.
///
/// Uses the writemask in k - elements are copied from src if the corresponding mask bit is not set.
/// Otherwise the computed value is written into the result.
///
/// [Intel's documentation](https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mask_gf2p8affineinv_epi64_epi8)
#[inline]
#[target_feature(enable = "avx512gfni,avx512bw,avx512vl")]
#[cfg_attr(test, assert_instr(vgf2p8affineinvqb, B = 0))]
#[rustc_legacy_const_generics(4)]
pub unsafe fn _mm_mask_gf2p8affineinv_epi64_epi8<const B: i32>(
    src: __m128i,
    k: __mmask16,
    x: __m128i,
    a: __m128i,
) -> __m128i {
    static_assert_imm8!(B);
    let b = B as u8;
    let x = x.as_i8x16();
    let a = a.as_i8x16();
    let r = vgf2p8affineinvqb_128(x, a, b);
    transmute(simd_select_bitmask(k, r, src.as_i8x16()))
}

#[cfg(test)]
mod tests {
    // The constants in the tests below are just bit patterns. They should not
    // be interpreted as integers; signedness does not make sense for them, but
    // __mXXXi happens to be defined in terms of signed integers.
    #![allow(overflowing_literals)]

    use core::hint::black_box;
    use core::intrinsics::size_of;
    use stdarch_test::simd_test;

    use crate::core_arch::x86::*;

    fn mulbyte(left: u8, right: u8) -> u8 {
        // this implementation follows the description in
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm512_gf2p8mul_epi8
        const REDUCTION_POLYNOMIAL: u16 = 0x11b;
        let left: u16 = left.into();
        let right: u16 = right.into();
        let mut carryless_product: u16 = 0;

        // Carryless multiplication
        for i in 0..8 {
            if ((left >> i) & 0x01) != 0 {
                carryless_product ^= right << i;
            }
        }

        // reduction, adding in "0" where appropriate to clear out high bits
        // note that REDUCTION_POLYNOMIAL is zero in this context
        for i in (8..=14).rev() {
            if ((carryless_product >> i) & 0x01) != 0 {
                carryless_product ^= REDUCTION_POLYNOMIAL << (i - 8);
            }
        }

        carryless_product as u8
    }

    const NUM_TEST_WORDS_512: usize = 4;
    const NUM_TEST_WORDS_256: usize = NUM_TEST_WORDS_512 * 2;
    const NUM_TEST_WORDS_128: usize = NUM_TEST_WORDS_256 * 2;
    const NUM_TEST_ENTRIES: usize = NUM_TEST_WORDS_512 * 64;
    const NUM_TEST_WORDS_64: usize = NUM_TEST_WORDS_128 * 2;
    const NUM_BYTES: usize = 256;
    const NUM_BYTES_WORDS_128: usize = NUM_BYTES / 16;
    const NUM_BYTES_WORDS_256: usize = NUM_BYTES_WORDS_128 / 2;
    const NUM_BYTES_WORDS_512: usize = NUM_BYTES_WORDS_256 / 2;

    fn parity(input: u8) -> u8 {
        let mut accumulator = 0;
        for i in 0..8 {
            accumulator ^= (input >> i) & 0x01;
        }
        accumulator
    }

    fn mat_vec_multiply_affine(matrix: u64, x: u8, b: u8) -> u8 {
        // this implementation follows the description in
        // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_gf2p8affine_epi64_epi8
        let mut accumulator = 0;

        for bit in 0..8 {
            accumulator |= parity(x & matrix.to_le_bytes()[bit]) << (7 - bit);
        }

        accumulator ^ b
    }

    fn generate_affine_mul_test_data(
        immediate: u8,
    ) -> (
        [u64; NUM_TEST_WORDS_64],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u64; NUM_TEST_WORDS_64] = [0; NUM_TEST_WORDS_64];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_WORDS_64 {
            left[i] = (i as u64) * 103 * 101;
            for j in 0..8 {
                let j64 = j as u64;
                right[i * 8 + j] = ((left[i] + j64) % 256) as u8;
                result[i * 8 + j] = mat_vec_multiply_affine(left[i], right[i * 8 + j], immediate);
            }
        }

        (left, right, result)
    }

    fn generate_inv_tests_data() -> ([u8; NUM_BYTES], [u8; NUM_BYTES]) {
        let mut input: [u8; NUM_BYTES] = [0; NUM_BYTES];
        let mut result: [u8; NUM_BYTES] = [0; NUM_BYTES];

        for i in 0..NUM_BYTES {
            input[i] = (i % 256) as u8;
            result[i] = if i == 0 { 0 } else { 1 };
        }

        (input, result)
    }

    const AES_S_BOX: [u8; NUM_BYTES] = [
        0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab,
        0x76, 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4,
        0x72, 0xc0, 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71,
        0xd8, 0x31, 0x15, 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2,
        0xeb, 0x27, 0xb2, 0x75, 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6,
        0xb3, 0x29, 0xe3, 0x2f, 0x84, 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb,
        0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45,
        0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
        0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44,
        0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a,
        0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, 0xe0, 0x32, 0x3a, 0x0a, 0x49,
        0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, 0xe7, 0xc8, 0x37, 0x6d,
        0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, 0xba, 0x78, 0x25,
        0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, 0x70, 0x3e,
        0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, 0xe1,
        0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
        0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb,
        0x16,
    ];

    fn generate_byte_mul_test_data() -> (
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
        [u8; NUM_TEST_ENTRIES],
    ) {
        let mut left: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut right: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];
        let mut result: [u8; NUM_TEST_ENTRIES] = [0; NUM_TEST_ENTRIES];

        for i in 0..NUM_TEST_ENTRIES {
            left[i] = (i % 256) as u8;
            // wrapping_mul avoids the debug-mode overflow panic that a
            // plain `left[i] * 101` would hit for most byte values
            right[i] = left[i].wrapping_mul(101);
            result[i] = mulbyte(left[i], right[i]);
        }

        (left, right, result)
    }

    #[target_feature(enable = "sse2")]
    unsafe fn load_m128i_word<T>(data: &[T], word_index: usize) -> __m128i {
        let byte_offset = word_index * 16 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m128i;
        _mm_loadu_si128(black_box(pointer))
    }

    #[target_feature(enable = "avx")]
    unsafe fn load_m256i_word<T>(data: &[T], word_index: usize) -> __m256i {
        let byte_offset = word_index * 32 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const __m256i;
        _mm256_loadu_si256(black_box(pointer))
    }

    #[target_feature(enable = "avx512f")]
    unsafe fn load_m512i_word<T>(data: &[T], word_index: usize) -> __m512i {
        let byte_offset = word_index * 64 / size_of::<T>();
        let pointer = data.as_ptr().offset(byte_offset as isize) as *const i32;
        _mm512_loadu_si512(black_box(pointer))
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let expected = load_m512i_word(&expected, i);
            let result = _mm512_gf2p8mul_epi8(left, right);
            assert_eq_m512i(result, expected);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_zero = _mm512_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            // Each bit of mask_words covers one 32-bit lane, i.e. four bits
            // of mask_bytes, so the epi32 blend below reproduces the byte mask.
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&left, i);
            let right = load_m512i_word(&right, i);
            let result_left = _mm512_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8mul_epi8(left, right);
            let result_masked = _mm512_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let expected = load_m256i_word(&expected, i);
            let result = _mm256_gf2p8mul_epi8(left, right);
            assert_eq_m256i(result, expected);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_zero = _mm256_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&left, i);
            let right = load_m256i_word(&right, i);
            let result_left = _mm256_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0x0F_F0_FF_00;
            const MASK_WORDS: i32 = 0b01_10_11_00;
            let expected_result = _mm256_gf2p8mul_epi8(left, right);
            let result_masked = _mm256_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8mul_epi8() {
        let (left, right, expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let expected = load_m128i_word(&expected, i);
            let result = _mm_gf2p8mul_epi8(left, right);
            assert_eq_m128i(result, expected);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_zero = _mm_maskz_gf2p8mul_epi8(0, left, right);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_maskz_gf2p8mul_epi8(mask_bytes, left, right);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8mul_epi8() {
        let (left, right, _expected) = generate_byte_mul_test_data();

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&left, i);
            let right = load_m128i_word(&right, i);
            let result_left = _mm_mask_gf2p8mul_epi8(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8mul_epi8(left, right);
            let result_masked = _mm_mask_gf2p8mul_epi8(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);
        let constant = _mm512_set1_epi64(constant);
        let constant_reference = _mm512_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let data = load_m512i_word(&bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);
            let data = load_m512i_word(&more_bytes, i);
            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m512i(result, data);
            let result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m512i(result, constant_reference);

            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let reference = load_m512i_word(&references, i);

            let result = _mm512_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm512_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);
        let constant = _mm256_set1_epi64x(constant);
        let constant_reference = _mm256_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let data = load_m256i_word(&bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);
            let data = load_m256i_word(&more_bytes, i);
            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m256i(result, data);
            let result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m256i(result, constant_reference);

            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let reference = load_m256i_word(&references, i);

            let result = _mm256_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }
    }
1110 
1111     #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
test_mm256_maskz_gf2p8affine_epi64_epi8()1112     unsafe fn test_mm256_maskz_gf2p8affine_epi64_epi8() {
1113         const CONSTANT_BYTE: i32 = 0x63;
1114         let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);
1115 
1116         for i in 0..NUM_TEST_WORDS_256 {
1117             let matrix = load_m256i_word(&matrices, i);
1118             let vector = load_m256i_word(&vectors, i);
1119             let result_zero =
1120                 _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
1121             assert_eq_m256i(result_zero, _mm256_setzero_si256());
1122             let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
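            // `_mm256_blend_epi32` takes its selector as a const generic (it lowers
            // to an immediate), so the word-granular mirror of `mask_bytes` is a
            // const here rather than the runtime mask used by the 512-bit tests.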
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm256_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affine_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        let constant: i64 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);
        let constant = _mm_set1_epi64x(constant);
        let constant_reference = _mm_set1_epi8(CONSTANT_BYTE as i8);

        let (bytes, more_bytes, _) = generate_byte_mul_test_data();
        let (matrices, vectors, references) = generate_affine_mul_test_data(IDENTITY_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let data = load_m128i_word(&bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);
            let data = load_m128i_word(&more_bytes, i);
            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(data, identity);
            assert_eq_m128i(result, data);
            let result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(data, constant);
            assert_eq_m128i(result, constant_reference);

            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let reference = load_m128i_word(&references, i);

            let result = _mm_gf2p8affine_epi64_epi8::<IDENTITY_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero = _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affine_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm512_set1_epi64(identity);

        // validate inversion
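        // (with the identity matrix and a zero constant, affineinv reduces to the
        // plain GF(2^8) inverse, so remultiplying the result by the input must
        // reproduce the reference products from `generate_inv_tests_data`)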
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_512 {
            let input = load_m512i_word(&inputs, i);
            let reference = load_m512i_word(&results, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm512_gf2p8mul_epi8(result, input);
            assert_eq_m512i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let vector = load_m512i_word(&vectors, i);
            let matrix = load_m512i_word(&matrices, i);

            let inv_vec = _mm512_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm512_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m512i(result, reference);
        }

        // validate the whole operation by checking against the AES S-box
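        // 0xF1_E3_C7_8F_1F_3E_7C_F8 is the bit-matrix of the AES affine step and
        // 0x63 its additive constant, so inversion followed by this affine
        // transform must reproduce the AES S-box byte for byte.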
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm512_set1_epi64(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_512 {
            let reference = load_m512i_word(&AES_S_BOX, i);
            let input = load_m512i_word(&inputs, i);
            let result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m512i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let matrix = load_m512i_word(&matrices, i);
            let vector = load_m512i_word(&vectors, i);
            let result_zero =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m512i(result_zero, _mm512_setzero_si512());
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm512_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm512_mask_blend_epi32(mask_words, _mm512_setzero_si512(), expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw")]
    unsafe fn test_mm512_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_512 {
            let left = load_m512i_word(&vectors, i);
            let right = load_m512i_word(&matrices, i);
            let result_left =
                _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m512i(result_left, left);
            let mask_bytes: __mmask64 = 0x0F_0F_0F_0F_FF_FF_00_00;
            let mask_words: __mmask16 = 0b01_01_01_01_11_11_00_00;
            let expected_result = _mm512_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm512_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm512_mask_blend_epi32(mask_words, left, expected_result);
            assert_eq_m512i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm256_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_256 {
            let input = load_m256i_word(&inputs, i);
            let reference = load_m256i_word(&results, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm256_gf2p8mul_epi8(result, input);
            assert_eq_m256i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let vector = load_m256i_word(&vectors, i);
            let matrix = load_m256i_word(&matrices, i);

            let inv_vec = _mm256_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm256_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m256i(result, reference);
        }

        // validate the whole operation by checking against the AES S-box
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm256_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_256 {
            let reference = load_m256i_word(&AES_S_BOX, i);
            let input = load_m256i_word(&inputs, i);
            let result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m256i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let matrix = load_m256i_word(&matrices, i);
            let vector = load_m256i_word(&vectors, i);
            let result_zero =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m256i(result_zero, _mm256_setzero_si256());
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm256_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm256_blend_epi32::<MASK_WORDS>(_mm256_setzero_si256(), expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm256_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_256 {
            let left = load_m256i_word(&vectors, i);
            let right = load_m256i_word(&matrices, i);
            let result_left =
                _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m256i(result_left, left);
            let mask_bytes: __mmask32 = 0xFF_0F_F0_00;
            const MASK_WORDS: i32 = 0b11_01_10_00;
            let expected_result = _mm256_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked = _mm256_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(
                left, mask_bytes, left, right,
            );
            let expected_masked = _mm256_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m256i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_gf2p8affineinv_epi64_epi8() {
        let identity: i64 = 0x01_02_04_08_10_20_40_80;
        const IDENTITY_BYTE: i32 = 0;
        const CONSTANT_BYTE: i32 = 0x63;
        let identity = _mm_set1_epi64x(identity);

        // validate inversion
        let (inputs, results) = generate_inv_tests_data();

        for i in 0..NUM_BYTES_WORDS_128 {
            let input = load_m128i_word(&inputs, i);
            let reference = load_m128i_word(&results, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(input, identity);
            let remultiplied = _mm_gf2p8mul_epi8(result, input);
            assert_eq_m128i(remultiplied, reference);
        }

        // validate subsequent affine operation
        let (matrices, vectors, _affine_expected) =
            generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let vector = load_m128i_word(&vectors, i);
            let matrix = load_m128i_word(&matrices, i);

            let inv_vec = _mm_gf2p8affineinv_epi64_epi8::<IDENTITY_BYTE>(vector, identity);
            let reference = _mm_gf2p8affine_epi64_epi8::<CONSTANT_BYTE>(inv_vec, matrix);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            assert_eq_m128i(result, reference);
        }

        // validate the whole operation by checking against the AES S-box
        const AES_S_BOX_MATRIX: i64 = 0xF1_E3_C7_8F_1F_3E_7C_F8;
        let sbox_matrix = _mm_set1_epi64x(AES_S_BOX_MATRIX);

        for i in 0..NUM_BYTES_WORDS_128 {
            let reference = load_m128i_word(&AES_S_BOX, i);
            let input = load_m128i_word(&inputs, i);
            let result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(input, sbox_matrix);
            assert_eq_m128i(result, reference);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_maskz_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let matrix = load_m128i_word(&matrices, i);
            let vector = load_m128i_word(&vectors, i);
            let result_zero =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(0, vector, matrix);
            assert_eq_m128i(result_zero, _mm_setzero_si128());
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(vector, matrix);
            let result_masked =
                _mm_maskz_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(mask_bytes, vector, matrix);
            let expected_masked =
                _mm_blend_epi32::<MASK_WORDS>(_mm_setzero_si128(), expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }

    #[simd_test(enable = "avx512gfni,avx512bw,avx512vl")]
    unsafe fn test_mm_mask_gf2p8affineinv_epi64_epi8() {
        const CONSTANT_BYTE: i32 = 0x63;
        let (matrices, vectors, _expected) = generate_affine_mul_test_data(CONSTANT_BYTE as u8);

        for i in 0..NUM_TEST_WORDS_128 {
            let left = load_m128i_word(&vectors, i);
            let right = load_m128i_word(&matrices, i);
            let result_left =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, 0, left, right);
            assert_eq_m128i(result_left, left);
            let mask_bytes: __mmask16 = 0x0F_F0;
            const MASK_WORDS: i32 = 0b01_10;
            let expected_result = _mm_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, right);
            let result_masked =
                _mm_mask_gf2p8affineinv_epi64_epi8::<CONSTANT_BYTE>(left, mask_bytes, left, right);
            let expected_masked = _mm_blend_epi32::<MASK_WORDS>(left, expected_result);
            assert_eq_m128i(result_masked, expected_masked);
        }
    }
}