#ifndef __Vec_Intrin_H
#define __Vec_Intrin_H

#include <sys/types.h>

// # DEBUGGING FLAGS
// =================
// The trace of the primitives defined in this file can be debugged by
// compiling with the [DEBUG_VECTOR_TRACE] C flag.
// Because the same vector types are used to manipulate blocks of uint32 and
// blocks of uint64, the log output depends on the endianness, in particular
// for generic operations like [and] or [xor]. By default, values are printed
// as if they were blocks of uint32; to print them as blocks of uint64,
// also define the [DEBUG_VECTOR_TRACE_ELEMENTS_64] flag.
// Note that when these flags are activated, the compilation options used to
// build HACL may need tweaking: vector support may have to be enabled for
// every file that depends on libintvector.h, even files which do not
// themselves use vectors. When comparing traces, also keep in mind that some
// instructions are not compiled in the same order on different platforms,
// though in practice this causes few discrepancies.

#define Lib_IntVector_Intrinsics_bit_mask64(x) -((x)&1)
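/* For example, with a uint64_t argument, bit_mask64(1) evaluates to
   0xffffffffffffffffULL and bit_mask64(2) to 0: an all-ones mask exactly
   when the low bit of x is set. */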

#if defined(__x86_64__) || defined(_M_X64)

// The following functions are only available on machines that support Intel
// AVX; the _ni_ macros below additionally rely on the AES-NI and PCLMULQDQ
// extensions.

#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>

typedef __m128i Lib_IntVector_Intrinsics_vec128;

#define Lib_IntVector_Intrinsics_ni_aes_enc(x0, x1) \
    (_mm_aesenc_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_aes_enc_last(x0, x1) \
    (_mm_aesenclast_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_aes_keygen_assist(x0, x1) \
    (_mm_aeskeygenassist_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_clmul(x0, x1, x2) \
    (_mm_clmulepi64_si128(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
    (_mm_xor_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq64(x0, x1) \
    (_mm_cmpeq_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq32(x0, x1) \
    (_mm_cmpeq_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt64(x0, x1) \
    (_mm_cmpgt_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt32(x0, x1) \
    (_mm_cmpgt_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_or(x0, x1) \
    (_mm_or_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_and(x0, x1) \
    (_mm_and_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
    (_mm_xor_si128(x0, _mm_set1_epi32(-1)))

#define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \
    (_mm_slli_si128(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \
    (_mm_srli_si128(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \
    (_mm_slli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right64(x0, x1) \
    (_mm_srli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_left32(x0, x1) \
    (_mm_slli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right32(x0, x1) \
    (_mm_srli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_8(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_24(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, x1) \
    (((x1) == 8 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_8(x0) : ((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) : ((x1) == 24 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_24(x0) : _mm_xor_si128(_mm_slli_epi32(x0, x1), _mm_srli_epi32(x0, 32 - (x1)))))))
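/* For byte-aligned rotation counts (8, 16, 24) the rotate is a single
   _mm_shuffle_epi8 byte permutation; any other count falls back to two
   32-bit shifts whose disjoint results are combined with xor. For example,
   rotating lanes holding 0x11223344 left by 8 yields 0x22334411. */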

#define Lib_IntVector_Intrinsics_vec128_rotate_right32(x0, x1) \
    (Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, 32 - (x1)))

#define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2 * x1 + 1, 2 * x1, 2 * x2 + 1, 2 * x2)))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes32(x0, x1) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE((x1 + 3) % 4, (x1 + 2) % 4, (x1 + 1) % 4, x1 % 4)))
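/* e.g. with x1 == 1, lanes (l0, l1, l2, l3) become (l1, l2, l3, l0): each
   lane moves down one position and lane 0 wraps around to the top. */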

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE((2 * x1 + 3) % 4, (2 * x1 + 2) % 4, (2 * x1 + 1) % 4, (2 * x1) % 4)))

#define Lib_IntVector_Intrinsics_vec128_load32_le(x0) \
    (_mm_loadu_si128((__m128i*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \
    (_mm_loadu_si128((__m128i*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), x1))

#define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), x1))
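
/* A minimal usage sketch (ours, not part of the HACL* API): xor two 16-byte
   little-endian blocks through the wrappers above. */
static inline void
Lib_IntVector_Intrinsics_vec128_xor_example_(unsigned char *out, const unsigned char *a, const unsigned char *b)
{
    Lib_IntVector_Intrinsics_vec128 va = Lib_IntVector_Intrinsics_vec128_load64_le(a);
    Lib_IntVector_Intrinsics_vec128 vb = Lib_IntVector_Intrinsics_vec128_load64_le(b);
    Lib_IntVector_Intrinsics_vec128_store64_le(out, Lib_IntVector_Intrinsics_vec128_xor(va, vb));
}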

#define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)))

#define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)))

#define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)))

#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))))

#define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3))))

#define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))))

#define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \
    (_mm_insert_epi8(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_insert32(x0, x1, x2) \
    (_mm_insert_epi32(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_insert64(x0, x1, x2) \
    (_mm_insert_epi64(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_extract8(x0, x1) \
    (_mm_extract_epi8(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract32(x0, x1) \
    (_mm_extract_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract64(x0, x1) \
    (_mm_extract_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_zero \
    (_mm_setzero_si128())

#define Lib_IntVector_Intrinsics_vec128_add64(x0, x1) \
    (_mm_add_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub64(x0, x1) \
    (_mm_sub_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul64(x0, x1) \
    (_mm_mul_epu32(x0, x1))
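/* _mm_mul_epu32 multiplies the unsigned low 32 bits of each 64-bit lane and
   returns the two full 64-bit products, the operation needed by e.g. a
   vectorized Poly1305-style multiplication. */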

#define Lib_IntVector_Intrinsics_vec128_smul64(x0, x1) \
    (_mm_mul_epu32(x0, _mm_set1_epi64x(x1)))

#define Lib_IntVector_Intrinsics_vec128_add32(x0, x1) \
    (_mm_add_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub32(x0, x1) \
    (_mm_sub_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul32(x0, x1) \
    (_mm_mullo_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_smul32(x0, x1) \
    (_mm_mullo_epi32(x0, _mm_set1_epi32(x1)))

#define Lib_IntVector_Intrinsics_vec128_load128(x) \
    ((__m128i)x)

#define Lib_IntVector_Intrinsics_vec128_load64(x) \
    (_mm_set1_epi64x(x)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load64s(x0, x1) \
    (_mm_set_epi64x(x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load32(x) \
    (_mm_set1_epi32(x))

#define Lib_IntVector_Intrinsics_vec128_load32s(x0, x1, x2, x3) \
    (_mm_set_epi32(x3, x2, x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_interleave_low32(x1, x2) \
    (_mm_unpacklo_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high32(x1, x2) \
    (_mm_unpackhi_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_low64(x1, x2) \
    (_mm_unpacklo_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high64(x1, x2) \
    (_mm_unpackhi_epi64(x1, x2))

// The following functions are only available on machines that support Intel AVX2

#include <immintrin.h>
#include <wmmintrin.h>

typedef __m256i Lib_IntVector_Intrinsics_vec256;

#define Lib_IntVector_Intrinsics_vec256_eq64(x0, x1) \
    (_mm256_cmpeq_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_eq32(x0, x1) \
    (_mm256_cmpeq_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_gt64(x0, x1) \
    (_mm256_cmpgt_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_gt32(x0, x1) \
    (_mm256_cmpgt_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_xor(x0, x1) \
    (_mm256_xor_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_or(x0, x1) \
    (_mm256_or_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_and(x0, x1) \
    (_mm256_and_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_lognot(x0) \
    (_mm256_xor_si256(x0, _mm256_set1_epi32(-1)))

#define Lib_IntVector_Intrinsics_vec256_shift_left(x0, x1) \
    (_mm256_slli_si256(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec256_shift_right(x0, x1) \
    (_mm256_srli_si256(x0, (x1) / 8))
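/* Note that _mm256_slli_si256/_mm256_srli_si256 shift bytes within each
   128-bit lane independently; bytes do not cross between the two lanes. */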

#define Lib_IntVector_Intrinsics_vec256_shift_left64(x0, x1) \
    (_mm256_slli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_right64(x0, x1) \
    (_mm256_srli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_left32(x0, x1) \
    (_mm256_slli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_right32(x0, x1) \
    (_mm256_srli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_8(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_16(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_24(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32(x0, x1) \
    ((x1 == 8 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_8(x0) : (x1 == 16 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_16(x0) : (x1 == 24 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_24(x0) : _mm256_or_si256(_mm256_slli_epi32(x0, x1), _mm256_srli_epi32(x0, 32 - (x1)))))))

#define Lib_IntVector_Intrinsics_vec256_rotate_right32(x0, x1) \
    (Lib_IntVector_Intrinsics_vec256_rotate_left32(x0, 32 - (x1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_8(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1, 8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_16(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(9, 8, 15, 14, 13, 12, 11, 10, 1, 0, 7, 6, 5, 4, 3, 2, 9, 8, 15, 14, 13, 12, 11, 10, 1, 0, 7, 6, 5, 4, 3, 2)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_24(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(10, 9, 8, 15, 14, 13, 12, 11, 2, 1, 0, 7, 6, 5, 4, 3, 10, 9, 8, 15, 14, 13, 12, 11, 2, 1, 0, 7, 6, 5, 4, 3)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_32(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_40(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(12, 11, 10, 9, 8, 15, 14, 13, 4, 3, 2, 1, 0, 7, 6, 5, 12, 11, 10, 9, 8, 15, 14, 13, 4, 3, 2, 1, 0, 7, 6, 5)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_48(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(13, 12, 11, 10, 9, 8, 15, 14, 5, 4, 3, 2, 1, 0, 7, 6, 13, 12, 11, 10, 9, 8, 15, 14, 5, 4, 3, 2, 1, 0, 7, 6)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_56(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7, 14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64(x0, x1) \
    ((x1 == 8 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_8(x0) : (x1 == 16 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_16(x0) : (x1 == 24 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_24(x0) : (x1 == 32 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_32(x0) : (x1 == 40 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_40(x0) : (x1 == 48 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_48(x0) : (x1 == 56 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_56(x0) : _mm256_xor_si256(_mm256_srli_epi64((x0), (x1)), _mm256_slli_epi64((x0), (64 - (x1))))))))))))
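/* As in the 32-bit case, rotation counts that are multiples of 8 are pure
   byte permutations and compile to a single _mm256_shuffle_epi8; any other
   count falls back to a pair of 64-bit shifts combined with xor. */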

#define Lib_IntVector_Intrinsics_vec256_rotate_left64(x0, x1) \
    (Lib_IntVector_Intrinsics_vec256_rotate_right64(x0, 64 - (x1)))

#define Lib_IntVector_Intrinsics_vec256_shuffle64(x0, x1, x2, x3, x4) \
    (_mm256_permute4x64_epi64(x0, _MM_SHUFFLE(x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec256_shuffle32(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
    (_mm256_permutevar8x32_epi32(x0, _mm256_set_epi32(x8, x7, x6, x5, x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right_lanes32(x0, x1) \
    (_mm256_permutevar8x32_epi32(x0, _mm256_set_epi32((x1 + 7) % 8, (x1 + 6) % 8, (x1 + 5) % 8, (x1 + 4) % 8, (x1 + 3) % 8, (x1 + 2) % 8, (x1 + 1) % 8, x1 % 8)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right_lanes64(x0, x1) \
    (_mm256_permute4x64_epi64(x0, _MM_SHUFFLE((x1 + 3) % 4, (x1 + 2) % 4, (x1 + 1) % 4, x1 % 4)))

#define Lib_IntVector_Intrinsics_vec256_load32_le(x0) \
    (_mm256_loadu_si256((__m256i*)(x0)))

#define Lib_IntVector_Intrinsics_vec256_load64_le(x0) \
    (_mm256_loadu_si256((__m256i*)(x0)))

#define Lib_IntVector_Intrinsics_vec256_load32_be(x0) \
    (_mm256_shuffle_epi8(_mm256_loadu_si256((__m256i*)(x0)), _mm256_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)))

#define Lib_IntVector_Intrinsics_vec256_load64_be(x0) \
    (_mm256_shuffle_epi8(_mm256_loadu_si256((__m256i*)(x0)), _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)))

#define Lib_IntVector_Intrinsics_vec256_store32_le(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), x1))

#define Lib_IntVector_Intrinsics_vec256_store64_le(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), x1))

#define Lib_IntVector_Intrinsics_vec256_store32_be(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), _mm256_shuffle_epi8(x1, _mm256_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3))))

#define Lib_IntVector_Intrinsics_vec256_store64_be(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), _mm256_shuffle_epi8(x1, _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))))

#define Lib_IntVector_Intrinsics_vec256_insert8(x0, x1, x2) \
    (_mm256_insert_epi8(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_insert32(x0, x1, x2) \
    (_mm256_insert_epi32(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_insert64(x0, x1, x2) \
    (_mm256_insert_epi64(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_extract8(x0, x1) \
    (_mm256_extract_epi8(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_extract32(x0, x1) \
    (_mm256_extract_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_extract64(x0, x1) \
    (_mm256_extract_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_zero \
    (_mm256_setzero_si256())

#define Lib_IntVector_Intrinsics_vec256_add64(x0, x1) \
    (_mm256_add_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_sub64(x0, x1) \
    (_mm256_sub_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_mul64(x0, x1) \
    (_mm256_mul_epu32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_smul64(x0, x1) \
    (_mm256_mul_epu32(x0, _mm256_set1_epi64x(x1)))

#define Lib_IntVector_Intrinsics_vec256_add32(x0, x1) \
    (_mm256_add_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_sub32(x0, x1) \
    (_mm256_sub_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_mul32(x0, x1) \
    (_mm256_mullo_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_smul32(x0, x1) \
    (_mm256_mullo_epi32(x0, _mm256_set1_epi32(x1)))

#define Lib_IntVector_Intrinsics_vec256_load64(x1) \
    (_mm256_set1_epi64x(x1)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec256_load64s(x0, x1, x2, x3) \
    (_mm256_set_epi64x(x3, x2, x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec256_load32(x) \
    (_mm256_set1_epi32(x))

#define Lib_IntVector_Intrinsics_vec256_load32s(x0, x1, x2, x3, x4, x5, x6, x7) \
    (_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec256_load128(x) \
    (_mm256_set_m128i((__m128i)(x), (__m128i)(x)))

#define Lib_IntVector_Intrinsics_vec256_load128s(x0, x1) \
    (_mm256_set_m128i((__m128i)x1, (__m128i)x0))

#define Lib_IntVector_Intrinsics_vec256_interleave_low32(x1, x2) \
    (_mm256_unpacklo_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_high32(x1, x2) \
    (_mm256_unpackhi_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_low64(x1, x2) \
    (_mm256_unpacklo_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_high64(x1, x2) \
    (_mm256_unpackhi_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_low128(x1, x2) \
    (_mm256_permute2x128_si256(x1, x2, 0x20))

#define Lib_IntVector_Intrinsics_vec256_interleave_high128(x1, x2) \
    (_mm256_permute2x128_si256(x1, x2, 0x31))

#elif (defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)) && !defined(__ARM_32BIT_STATE)
#include <arm_neon.h>

typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;

#define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
    (veorq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq64(x0, x1) \
    (vceqq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq32(x0, x1) \
    (vceqq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt32(x0, x1) \
    (vcgtq_u32(x0, x1))

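/* high32/low32: helpers for gt64 below. They narrow each 64-bit lane of a
   vec128 to its high (resp. low) 32 bits, producing a uint32x2_t. */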
#define high32(x0) \
    (vmovn_u64(vshrq_n_u64(vreinterpretq_u64_u32(x0), 32)))

#define low32(x0) \
    (vmovn_u64(vreinterpretq_u64_u32(x0)))

#define Lib_IntVector_Intrinsics_vec128_gt64(x0, x1) \
    (vreinterpretq_u32_u64(vmovl_u32(vorr_u32(vcgt_u32(high32(x0), high32(x1)), vand_u32(vceq_u32(high32(x0), high32(x1)), vcgt_u32(low32(x0), low32(x1)))))))

#define Lib_IntVector_Intrinsics_vec128_or(x0, x1) \
    (vorrq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_and(x0, x1) \
    (vandq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
    (vmvnq_u32(x0))

#define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \
    (vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x0), 16 - (x1) / 8)))

#define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \
    (vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(x0), vdupq_n_u8(0), (x1) / 8)))

#define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \
    (vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_shift_right64(x0, x1) \
    (vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_shift_left32(x0, x1) \
    (vshlq_n_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right32(x0, x1) \
    (vshrq_n_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x1) \
    (vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, x1) \
    (((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) : vsriq_n_u32(vshlq_n_u32((x0), (x1)), (x0), 32 - (x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32_16(x1) \
    (vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32(x0, x1) \
    (((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_right32_16(x0) : vsriq_n_u32(vshlq_n_u32((x0), 32 - (x1)), (x0), (x1))))
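/* In the general case the rotate is built from a shift and vsriq_n_u32
   (shift right and insert), which slides the rotated-out bits back in;
   a 16-bit rotation is instead a single vrev32q_u16. */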

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes32(x0, x1) \
    (vextq_u32(x0, x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \
    (vreinterpretq_u32_u64(vextq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x0), x1)))

/*
#define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \
  (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x1,x2,x3,x4)))

#define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \
  (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2*x1+1,2*x1,2*x2+1,2*x2)))
*/

#define Lib_IntVector_Intrinsics_vec128_load32_le(x0) \
    (vld1q_u32((const uint32_t*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \
    (vld1q_u32((const uint32_t*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (x1)))

#define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (x1)))

/*
#define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
  (     Lib_IntVector_Intrinsics_vec128 l = vrev64q_u8(vld1q_u32((uint32_t*)(x0)));

*/

#define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \
    (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))

#define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \
    (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))

/*
#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1) \
  (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))))
*/

#define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x1))))))

#define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(x1))))))

#define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \
    (vsetq_lane_u8(x1, x0, x2))

#define Lib_IntVector_Intrinsics_vec128_insert32(x0, x1, x2) \
    (vsetq_lane_u32(x1, x0, x2))

#define Lib_IntVector_Intrinsics_vec128_insert64(x0, x1, x2) \
    (vreinterpretq_u32_u64(vsetq_lane_u64(x1, vreinterpretq_u64_u32(x0), x2)))

#define Lib_IntVector_Intrinsics_vec128_extract8(x0, x1) \
    (vgetq_lane_u8(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract32(x0, x1) \
    (vgetq_lane_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract64(x0, x1) \
    (vgetq_lane_u64(vreinterpretq_u64_u32(x0), x1))

#define Lib_IntVector_Intrinsics_vec128_zero \
    (vdupq_n_u32(0))

#define Lib_IntVector_Intrinsics_vec128_add64(x0, x1) \
    (vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_sub64(x0, x1) \
    (vreinterpretq_u32_u64(vsubq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_mul64(x0, x1) \
    (vreinterpretq_u32_u64(vmull_u32(vmovn_u64(vreinterpretq_u64_u32(x0)), vmovn_u64(vreinterpretq_u64_u32(x1)))))

#define Lib_IntVector_Intrinsics_vec128_smul64(x0, x1) \
    (vreinterpretq_u32_u64(vmull_n_u32(vmovn_u64(vreinterpretq_u64_u32(x0)), (uint32_t)x1)))

#define Lib_IntVector_Intrinsics_vec128_add32(x0, x1) \
    (vaddq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub32(x0, x1) \
    (vsubq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul32(x0, x1) \
    (vmulq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_smul32(x0, x1) \
    (vmulq_u32(x0, vdupq_n_u32(x1)))

#define Lib_IntVector_Intrinsics_vec128_load128(x) \
    ((uint32x4_t)(x))

#define Lib_IntVector_Intrinsics_vec128_load64(x) \
    (vreinterpretq_u32_u64(vdupq_n_u64(x))) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load32(x) \
    (vdupq_n_u32(x)) /* hi lo */

static inline Lib_IntVector_Intrinsics_vec128
Lib_IntVector_Intrinsics_vec128_load64s(uint64_t x1, uint64_t x2)
{
    const uint64_t a[2] = { x1, x2 };
    return vreinterpretq_u32_u64(vld1q_u64(a));
}
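/* In vec128_load64s, lane 0 (the low 64 bits) receives x1 and lane 1
   receives x2, matching the hi/lo convention of the x86 load64s above. */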

static inline Lib_IntVector_Intrinsics_vec128
Lib_IntVector_Intrinsics_vec128_load32s(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4)
{
    const uint32_t a[4] = { x1, x2, x3, x4 };
    return vld1q_u32(a);
}

#define Lib_IntVector_Intrinsics_vec128_interleave_low32(x1, x2) \
    (vzip1q_u32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high32(x1, x2) \
    (vzip2q_u32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_low64(x1, x2) \
    (vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(x1), vreinterpretq_u64_u32(x2))))

#define Lib_IntVector_Intrinsics_vec128_interleave_high64(x1, x2) \
    (vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(x1), vreinterpretq_u64_u32(x2))))

// IBM z architecture
#elif defined(__s390x__) // this flag is for GCC only

#include <vecintrin.h>

// The main vector 128 type
// We can't use uint8_t, uint32_t, uint64_t... instead of unsigned char,
// unsigned int, unsigned long long: the compiler complains that the parameter
// combination is invalid.
typedef unsigned char vector128_8 __attribute__((vector_size(16)));
typedef unsigned int vector128_32 __attribute__((vector_size(16)));
typedef unsigned long long vector128_64 __attribute__((vector_size(16)));

typedef vector128_8 Lib_IntVector_Intrinsics_vec128;
typedef vector128_8 vector128;

// Small helper to change the endianness of the vector's elements, seen as
// uint32. Note that we can't use vec_revb.
#define Lib_IntVector_Intrinsics_vec128_load_store_switch_endian32(x0) \
    ((vector128)(vec_perm((vector128_8)(x0), (vector128_8){},          \
                          (vector128_8){ 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 })))

// Small helper to change the endianness of the vector's elements, seen as
// uint64. Note that we can't use vec_revb.
#define Lib_IntVector_Intrinsics_vec128_load_store_switch_endian64(x0) \
    ((vector128)(vec_perm((vector128_8)(x0), (vector128_8){},          \
                          (vector128_8){ 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 })))
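/* e.g. the uint64 variant maps input bytes 0..15 to
   7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8: each 64-bit
   element is byte-reversed in place. */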

#define Lib_IntVector_Intrinsics_vec128_load32_le(x)                        \
    ((vector128)Lib_IntVector_Intrinsics_vec128_load_store_switch_endian32( \
        ((vector128_8)vec_load_len((const uint8_t*)(x), 16))))

#define Lib_IntVector_Intrinsics_vec128_load64_le(x)                        \
    ((vector128)Lib_IntVector_Intrinsics_vec128_load_store_switch_endian64( \
        ((vector128_8)vec_load_len((const uint8_t*)(x), 16))))

#define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1)                                        \
    (vec_store_len(((vector128_8)Lib_IntVector_Intrinsics_vec128_load_store_switch_endian32(x1)), \
                   ((uint8_t*)(x0)), (uint32_t)16))

#define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1)                                        \
    (vec_store_len(((vector128_8)Lib_IntVector_Intrinsics_vec128_load_store_switch_endian64(x1)), \
                   ((uint8_t*)(x0)), (uint32_t)16))

#define Lib_IntVector_Intrinsics_vec128_add32(x0, x1) \
    ((vector128)((vector128_32)(((vector128_32)(x0)) + ((vector128_32)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_add64(x0, x1) \
    ((vector128)((vector128_64)(((vector128_64)(x0)) + ((vector128_64)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_and(x0, x1) \
    ((vector128)(vec_and((vector128)(x0), (vector128)(x1))))

#define Lib_IntVector_Intrinsics_vec128_eq32(x0, x1) \
    ((vector128)(vec_cmpeq(((vector128_32)(x0)), ((vector128_32)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_eq64(x0, x1) \
    ((vector128)(vec_cmpeq(((vector128_64)(x0)), ((vector128_64)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_extract32(x0, x1) \
    ((unsigned int)(vec_extract((vector128_32)(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_extract64(x0, x1) \
    ((unsigned long long)(vec_extract((vector128_64)(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_gt32(x0, x1) \
    ((vector128)((vector128_32)(((vector128_32)(x0)) > ((vector128_32)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_gt64(x0, x1) \
    ((vector128)((vector128_64)(((vector128_64)(x0)) > ((vector128_64)(x1)))))

#define Lib_IntVector_Intrinsics_vec128_insert32(x0, x1, x2) \
    ((vector128)((vector128_32)vec_insert((unsigned int)(x1), (vector128_32)(x0), x2)))

#define Lib_IntVector_Intrinsics_vec128_insert64(x0, x1, x2) \
    ((vector128)((vector128_64)vec_insert((unsigned long long)(x1), (vector128_64)(x0), x2)))

#define Lib_IntVector_Intrinsics_vec128_interleave_high32(x0, x1) \
    ((vector128)((vector128_32)vec_mergel((vector128_32)(x0), (vector128_32)(x1))))

#define Lib_IntVector_Intrinsics_vec128_interleave_high64(x0, x1) \
    ((vector128)((vector128_64)vec_mergel((vector128_64)(x0), (vector128_64)(x1))))

#define Lib_IntVector_Intrinsics_vec128_interleave_low32(x0, x1) \
    ((vector128)((vector128_32)vec_mergeh((vector128_32)(x0), (vector128_32)(x1))))

#define Lib_IntVector_Intrinsics_vec128_interleave_low64(x0, x1) \
    ((vector128)((vector128_64)vec_mergeh((vector128_64)(x0), (vector128_64)(x1))))

#define Lib_IntVector_Intrinsics_vec128_load32(x)                      \
    ((vector128)((vector128_32){ (unsigned int)(x), (unsigned int)(x), \
                                 (unsigned int)(x), (unsigned int)(x) }))

#define Lib_IntVector_Intrinsics_vec128_load32s(x0, x1, x2, x3) \
    ((vector128)((vector128_32){ (unsigned int)(x0), (unsigned int)(x1), (unsigned int)(x2), (unsigned int)(x3) }))

#define Lib_IntVector_Intrinsics_vec128_load64(x) \
    ((vector128)((vector128_64)vec_load_pair((unsigned long long)(x), (unsigned long long)(x))))

#define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
    ((vector128)(vec_xor((vector128)(x0), (vector128)vec_splat_u32(-1))))

// vec_mule multiplies the even-indexed 32-bit elements of its arguments; on
// this big-endian target those are the high halves of the uint64 lanes. We
// therefore need to permute the low and high components of each uint64
// before calling vec_mule. The following helper does that.
#define Lib_IntVector_Intrinsics_vec128_mul64_perm_low_high_(x0) \
    ((vector128)(vec_perm((vector128_8)(x0), (vector128_8){},    \
                          (vector128_8){ 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 })))

#define Lib_IntVector_Intrinsics_vec128_mul64(x0, x1)                                             \
    ((vector128)(vec_mule((vector128_32)Lib_IntVector_Intrinsics_vec128_mul64_perm_low_high_(x0), \
                          (vector128_32)Lib_IntVector_Intrinsics_vec128_mul64_perm_low_high_(x1))))

#define Lib_IntVector_Intrinsics_vec128_or(x0, x1) \
    ((vector128)(vec_or((vector128)(x0), (vector128)(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, x1) \
    ((vector128)(vec_rli((vector128_32)(x0), (unsigned long)(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32(x0, x1) \
    (Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, (uint32_t)(32 - (x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes32(x0, x1)                                                                                          \
    ((vector128)(vec_perm((vector128)(x0), (vector128){}, (vector128_8){                                                                                      \
                                                              (x1 % 4) * 4 + 0, (x1 % 4) * 4 + 1, (x1 % 4) * 4 + 2, (x1 % 4) * 4 + 3,                         \
                                                              ((x1 + 1) % 4) * 4 + 0, ((x1 + 1) % 4) * 4 + 1, ((x1 + 1) % 4) * 4 + 2, ((x1 + 1) % 4) * 4 + 3, \
                                                              ((x1 + 2) % 4) * 4 + 0, ((x1 + 2) % 4) * 4 + 1, ((x1 + 2) % 4) * 4 + 2, ((x1 + 2) % 4) * 4 + 3, \
                                                              ((x1 + 3) % 4) * 4 + 0, ((x1 + 3) % 4) * 4 + 1, ((x1 + 3) % 4) * 4 + 2, ((x1 + 3) % 4) * 4 + 3 })))

#define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1)                         \
    (((vector128)((vector128_64)vec_rli((vector128_64)(x0), (unsigned long)(x1)))) & \
     ((vector128)((vector128_64){ 0xffffffffffffffff << (x1), 0xffffffffffffffff << (x1) })))

#define Lib_IntVector_Intrinsics_vec128_shift_right64(x0, x1)                               \
    (((vector128)((vector128_64)vec_rli((vector128_64)(x0), (unsigned long)(64 - (x1))))) & \
     ((vector128)((vector128_64){ 0xffffffffffffffff >> (x1), 0xffffffffffffffff >> (x1) })))

// Doesn't work with vec_splat_u64
#define Lib_IntVector_Intrinsics_vec128_smul64(x0, x1) \
    ((vector128)(Lib_IntVector_Intrinsics_vec128_mul64(x0, ((vector128_64){ (unsigned long long)(x1), (unsigned long long)(x1) }))))

#define Lib_IntVector_Intrinsics_vec128_sub64(x0, x1) \
    ((vector128)((vector128_64)(x0) - (vector128_64)(x1)))

#define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
    ((vector128)(vec_xor((vector128)(x0), (vector128)(x1))))

#define Lib_IntVector_Intrinsics_vec128_zero \
    ((vector128){})

#endif // IBM z architecture

#endif