/* SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Copyright:
 *   2020      Evan Nemerson <evan@nemerson.com>
 *   2016      Thomas Pornin <pornin@bolet.org>
 */

/* The portable version is based on the implementation in BearSSL,
 * which is MIT licensed, constant-time / branch-free, and documented
 * at https://www.bearssl.org/constanttime.html (specifically, we use
 * the implementation from ghash_ctmul64.c). */

#if !defined(SIMDE_X86_CLMUL_H)
#define SIMDE_X86_CLMUL_H

#include "avx512/set.h"
#include "avx512/setzero.h"

#if !defined(SIMDE_X86_PCLMUL_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES)
#  define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES
#endif

HEDLEY_DIAGNOSTIC_PUSH
SIMDE_DISABLE_UNWANTED_DIAGNOSTICS
SIMDE_BEGIN_DECLS_

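/* Carry-less multiplication of two 64-bit values, returning the low 64
 * bits of the 128-bit product.  Shifted partial products are combined
 * with XOR instead of addition (polynomial multiplication over GF(2)),
 * so, for example, 3 clmul 3 == 5 rather than 9, since
 * (x + 1) * (x + 1) == x^2 + 1 when coefficients are reduced modulo 2. */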
SIMDE_FUNCTION_ATTRIBUTES
uint64_t
simde_x_clmul_u64(uint64_t x, uint64_t y) {
  uint64_t x0, x1, x2, x3;
  uint64_t y0, y1, y2, y3;
  uint64_t z0, z1, z2, z3;

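  /* "Holed" representation (BearSSL's bmul64 trick): keep only every
   * fourth bit of each operand so that the ordinary integer multiplies
   * below cannot produce carries that land on another kept bit; the
   * final masking then discards the carry noise, leaving exactly the
   * XOR (carry-less) combination of the partial products. */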
  x0 = x & UINT64_C(0x1111111111111111);
  x1 = x & UINT64_C(0x2222222222222222);
  x2 = x & UINT64_C(0x4444444444444444);
  x3 = x & UINT64_C(0x8888888888888888);
  y0 = y & UINT64_C(0x1111111111111111);
  y1 = y & UINT64_C(0x2222222222222222);
  y2 = y & UINT64_C(0x4444444444444444);
  y3 = y & UINT64_C(0x8888888888888888);

  z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1);
  z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2);
  z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3);
  z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0);

  z0 &= UINT64_C(0x1111111111111111);
  z1 &= UINT64_C(0x2222222222222222);
  z2 &= UINT64_C(0x4444444444444444);
  z3 &= UINT64_C(0x8888888888888888);

  return z0 | z1 | z2 | z3;
}

static uint64_t
simde_x_bitreverse_u64(uint64_t v) {
  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE)
    uint8x8_t bytes = vreinterpret_u8_u64(vmov_n_u64(v));
    bytes = vrbit_u8(bytes);
    bytes = vrev64_u8(bytes);
    return vget_lane_u64(vreinterpret_u64_u8(bytes), 0);
  #elif defined(SIMDE_X86_GFNI_NATIVE)
    /* I don't think there is (or likely will ever be) a CPU with GFNI
     * but not pclmulqdq, but this may be useful for things other than
     * _mm_clmulepi64_si128. */
    __m128i vec = _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, v));

    /* Reverse bits within each byte */
    vec = _mm_gf2p8affine_epi64_epi8(vec, _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, UINT64_C(0x8040201008040201))), 0);

    /* Reverse bytes */
    #if defined(SIMDE_X86_SSSE3_NATIVE)
      vec = _mm_shuffle_epi8(vec, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7));
    #else
      vec = _mm_or_si128(_mm_slli_epi16(vec, 8), _mm_srli_epi16(vec, 8));
      vec = _mm_shufflelo_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3));
      vec = _mm_shufflehi_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3));
    #endif

    return HEDLEY_STATIC_CAST(uint64_t, _mm_cvtsi128_si64(vec));
  #elif HEDLEY_HAS_BUILTIN(__builtin_bitreverse64)
    return __builtin_bitreverse64(v);
  #else
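    /* Portable SWAR fallback: swap adjacent bits, then 2-bit pairs,
     * nibbles, bytes, and 16-bit halves, and finish by swapping the
     * two 32-bit halves. */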
    v = ((v >>  1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) <<  1);
    v = ((v >>  2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) <<  2);
    v = ((v >>  4) & UINT64_C(0x0F0F0F0F0F0F0F0F)) | ((v & UINT64_C(0x0F0F0F0F0F0F0F0F)) <<  4);
    v = ((v >>  8) & UINT64_C(0x00FF00FF00FF00FF)) | ((v & UINT64_C(0x00FF00FF00FF00FF)) <<  8);
    v = ((v >> 16) & UINT64_C(0x0000FFFF0000FFFF)) | ((v & UINT64_C(0x0000FFFF0000FFFF)) << 16);
    return (v >> 32) | (v << 32);
  #endif
}

SIMDE_FUNCTION_ATTRIBUTES
simde__m128i
simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT(imm8) {
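  /* Bit 0 of imm8 selects which 64-bit half of a is used and bit 4
   * selects which 64-bit half of b is used; the full 128-bit carry-less
   * product of those two quadwords is returned. */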
  simde__m128i_private
    a_ = simde__m128i_to_private(a),
    b_ = simde__m128i_to_private(b),
    r_;

  #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES)
    uint64x1_t A = ((imm8) & 0x01) ? vget_high_u64(a_.neon_u64) : vget_low_u64(a_.neon_u64);
    uint64x1_t B = ((imm8) & 0x10) ? vget_high_u64(b_.neon_u64) : vget_low_u64(b_.neon_u64);
    #if defined(SIMDE_BUG_CLANG_48257)
      HEDLEY_DIAGNOSTIC_PUSH
      SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_
    #endif
    poly64_t A_ = vget_lane_p64(vreinterpret_p64_u64(A), 0);
    poly64_t B_ = vget_lane_p64(vreinterpret_p64_u64(B), 0);
    #if defined(SIMDE_BUG_CLANG_48257)
      HEDLEY_DIAGNOSTIC_POP
    #endif
    poly128_t R = vmull_p64(A_, B_);
    r_.neon_u64 = vreinterpretq_u64_p128(R);
  #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128)
    #if defined(SIMDE_SHUFFLE_VECTOR_)
      switch (imm8 & 0x11) {
        case 0x00:
          b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0);
          a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0);
          break;
        case 0x01:
          b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0);
          a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1);
          break;
        case 0x10:
          b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1);
          a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0);
          break;
        case 0x11:
          b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1);
          a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1);
          break;
      }
    #else
      {
        const uint64_t A = a_.u64[(imm8     ) & 1];
        const uint64_t B = b_.u64[(imm8 >> 4) & 1];

        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) {
          a_.u64[i] = A;
          b_.u64[i] = B;
        }
      }
    #endif

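    /* Both operands now hold the selected quadword in both lanes.  Lane
     * 0 computes the low 64 bits of the product directly; lane 1
     * computes the high 64 bits via bit reversal: bit k of
     * clmul(rev(a), rev(b)) is bit 126 - k of the full 128-bit product,
     * so reversing that result and shifting right by one recovers bits
     * 64..127 (bit 127 of the product is always zero). */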
    simde__m128i_private reversed_;
    {
      #if defined(SIMDE_SHUFFLE_VECTOR_)
        reversed_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, b_.u64, 1, 3);
      #else
        reversed_.u64[0] = a_.u64[1];
        reversed_.u64[1] = b_.u64[1];
      #endif

      SIMDE_VECTORIZE
      for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) {
        reversed_.u64[i] = simde_x_bitreverse_u64(reversed_.u64[i]);
      }
    }

    #if defined(SIMDE_SHUFFLE_VECTOR_)
      a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, reversed_.u64, 0, 2);
      b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, reversed_.u64, 1, 3);
    #else
      a_.u64[1] = reversed_.u64[0];
      b_.u64[1] = reversed_.u64[1];
    #endif

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) {
      r_.u64[i] = simde_x_clmul_u64(a_.u64[i], b_.u64[i]);
    }

    r_.u64[1] = simde_x_bitreverse_u64(r_.u64[1]) >> 1;
  #else
    r_.u64[0] =                        simde_x_clmul_u64(                       a_.u64[imm8 & 1],                         b_.u64[(imm8 >> 4) & 1]);
    r_.u64[1] = simde_x_bitreverse_u64(simde_x_clmul_u64(simde_x_bitreverse_u64(a_.u64[imm8 & 1]), simde_x_bitreverse_u64(b_.u64[(imm8 >> 4) & 1]))) >> 1;
  #endif

  return simde__m128i_from_private(r_);
}
#if defined(SIMDE_X86_PCLMUL_NATIVE)
  #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8)
#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES)
  #define simde_mm_clmulepi64_si128(a, b, imm8) \
    simde__m128i_from_neon_u64( \
      vreinterpretq_u64_p128( \
        vmull_p64( \
          vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(a)), (imm8     ) & 1), \
          vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(b)), (imm8 >> 4) & 1) \
        ) \
      ) \
    )
#endif
#if defined(SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES)
  #undef _mm_clmulepi64_si128
  #define _mm_clmulepi64_si128(a, b, imm8) simde_mm_clmulepi64_si128(a, b, imm8)
#endif
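/* Example (illustrative sketch, not part of the API): multiplying the
 * low lanes of two vectors as GF(2) polynomials.  With both inputs set
 * to 3 (the polynomial x + 1) and imm8 == 0x00 selecting the low lane
 * of each operand, the result is { 5, 0 }, i.e. x^2 + 1:
 *
 *   simde__m128i a = simde_mm_set_epi64x(0, 3);
 *   simde__m128i b = simde_mm_set_epi64x(0, 3);
 *   simde__m128i r = simde_mm_clmulepi64_si128(a, b, 0x00);
 */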

SIMDE_FUNCTION_ATTRIBUTES
simde__m256i
simde_mm256_clmulepi64_epi128 (simde__m256i a, simde__m256i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT(imm8) {
  simde__m256i_private
    a_ = simde__m256i_to_private(a),
    b_ = simde__m256i_to_private(b),
    r_;

  #if defined(SIMDE_X86_PCLMUL_NATIVE)
    switch (imm8 & 0x11) {
      case 0x00:
        r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x00);
        r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x00);
        break;
      case 0x01:
        r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x01);
        r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x01);
        break;
      case 0x10:
        r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x10);
        r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x10);
        break;
      case 0x11:
        r_.m128i[0] = _mm_clmulepi64_si128(a_.m128i[0], b_.m128i[0], 0x11);
        r_.m128i[1] = _mm_clmulepi64_si128(a_.m128i[1], b_.m128i[1], 0x11);
        break;
    }
  #else
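    /* Portable path: gather the selected quadword from each 128-bit
     * lane into a_lo_/b_lo_, keep bit-reversed copies in a_hi_/b_hi_,
     * then compute the low and high halves of each product into
     * r_lo_/r_hi_ before interleaving them back together. */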
    simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_;

    #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
      switch (imm8 & 0x01) {
        case 0x00:
          a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2);
          break;
        case 0x01:
          a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3);
          break;
      }
      switch (imm8 & 0x10) {
        case 0x00:
          b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2);
          break;
        case 0x10:
          b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3);
          break;
      }
    #else
      a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0];
      a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2];
      b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0];
      b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2];
    #endif

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) {
      a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]);
      b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]);

      r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]);
      r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]);

      r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1;
    }

    #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
      r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3);
    #elif defined(SIMDE_SHUFFLE_VECTOR_)
      r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_)));
      r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3);
    #else
      r_.u64[0] = r_lo_.u64[0];
      r_.u64[1] = r_hi_.u64[0];
      r_.u64[2] = r_lo_.u64[1];
      r_.u64[3] = r_hi_.u64[1];
    #endif
  #endif

  return simde__m256i_from_private(r_);
}
#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX_NATIVE)
  #define simde_mm256_clmulepi64_epi128(a, b, imm8) _mm256_clmulepi64_epi128(a, b, imm8)
#endif
#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES)
  #undef _mm256_clmulepi64_epi128
  #define _mm256_clmulepi64_epi128(a, b, imm8) simde_mm256_clmulepi64_epi128(a, b, imm8)
#endif

SIMDE_FUNCTION_ATTRIBUTES
simde__m512i
simde_mm512_clmulepi64_epi128 (simde__m512i a, simde__m512i b, const int imm8)
    SIMDE_REQUIRE_CONSTANT(imm8) {
  simde__m512i_private
    a_ = simde__m512i_to_private(a),
    b_ = simde__m512i_to_private(b),
    r_;

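  /* Zero-initialize r_ up front on MSVC, presumably so the compiler
   * does not warn about (or mis-handle) a potentially uninitialized
   * variable when it cannot prove every path below assigns to it. */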
  #if defined(HEDLEY_MSVC_VERSION)
    r_ = simde__m512i_to_private(simde_mm512_setzero_si512());
  #endif
  #if SIMDE_NATURAL_VECTOR_SIZE_LE(256)
    switch (imm8 & 0x11) {
      case 0x00:
        r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x00);
        r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x00);
        break;
      case 0x01:
        r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x01);
        r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x01);
        break;
      case 0x10:
        r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x10);
        r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x10);
        break;
      case 0x11:
        r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x11);
        r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x11);
        break;
    }
  #else
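    /* Same approach as the 256-bit fallback above, applied across all
     * four 128-bit lanes at once. */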
    simde__m256i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_;

    #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
      switch (imm8 & 0x01) {
        case 0x00:
          a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2, 4, 6);
          break;
        case 0x01:
          a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3, 5, 7);
          break;
      }
      switch (imm8 & 0x10) {
        case 0x00:
          b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2, 4, 6);
          break;
        case 0x10:
          b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3, 5, 7);
          break;
      }
    #else
      a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0];
      a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2];
      a_lo_.u64[2] = a_.u64[((imm8 >> 0) & 1) + 4];
      a_lo_.u64[3] = a_.u64[((imm8 >> 0) & 1) + 6];
      b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0];
      b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2];
      b_lo_.u64[2] = b_.u64[((imm8 >> 4) & 1) + 4];
      b_lo_.u64[3] = b_.u64[((imm8 >> 4) & 1) + 6];
    #endif

    SIMDE_VECTORIZE
    for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) {
      a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]);
      b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]);

      r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]);
      r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]);

      r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1;
    }

    #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION)
      r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 4, 1, 5, 2, 6, 3, 7);
    #else
      r_.u64[0] = r_lo_.u64[0];
      r_.u64[1] = r_hi_.u64[0];
      r_.u64[2] = r_lo_.u64[1];
      r_.u64[3] = r_hi_.u64[1];
      r_.u64[4] = r_lo_.u64[2];
      r_.u64[5] = r_hi_.u64[2];
      r_.u64[6] = r_lo_.u64[3];
      r_.u64[7] = r_hi_.u64[3];
    #endif
  #endif

  return simde__m512i_from_private(r_);
}
#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE)
  #define simde_mm512_clmulepi64_epi128(a, b, imm8) _mm512_clmulepi64_epi128(a, b, imm8)
#endif
#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES)
  #undef _mm512_clmulepi64_epi128
  #define _mm512_clmulepi64_epi128(a, b, imm8) simde_mm512_clmulepi64_epi128(a, b, imm8)
#endif

SIMDE_END_DECLS_

HEDLEY_DIAGNOSTIC_POP

#endif /* !defined(SIMDE_X86_CLMUL_H) */