#ifndef __Vec_Intrin_H
#define __Vec_Intrin_H

#include <sys/types.h>

#define Lib_IntVector_Intrinsics_bit_mask64(x) -((x)&1)
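
/* Lib_IntVector_Intrinsics_bit_mask64 expands the low bit of x into an
   all-ones or all-zeroes 64-bit mask, the usual building block for
   constant-time selection. A minimal sketch (illustrative only, not part of
   this header; the function name is hypothetical):

   static inline uint64_t
   example_ct_select64(uint64_t bit, uint64_t a, uint64_t b)
   {
       uint64_t mask = Lib_IntVector_Intrinsics_bit_mask64(bit); // all ones iff bit == 1
       return (a & mask) | (b & ~mask);                          // a iff bit == 1, else b
   }
*/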

#if defined(__x86_64__) || defined(_M_X64)

// The following functions are only available on machines that support Intel AVX;
// they use SSE2/SSSE3/SSE4.1 instructions plus the AES-NI and CLMUL extensions.

#include <emmintrin.h>
#include <tmmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h> // declares the AES-NI and carry-less multiply intrinsics used below

typedef __m128i Lib_IntVector_Intrinsics_vec128;

#define Lib_IntVector_Intrinsics_ni_aes_enc(x0, x1) \
    (_mm_aesenc_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_aes_enc_last(x0, x1) \
    (_mm_aesenclast_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_aes_keygen_assist(x0, x1) \
    (_mm_aeskeygenassist_si128(x0, x1))

#define Lib_IntVector_Intrinsics_ni_clmul(x0, x1, x2) \
    (_mm_clmulepi64_si128(x0, x1, x2))
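
/* The ni_aes_* and ni_clmul wrappers map one-to-one onto the AES-NI and
   carry-less multiply instructions: ni_aes_enc performs one full AES round
   and ni_aes_enc_last the final round without MixColumns. A hedged sketch of
   AES-128 block encryption in terms of these macros, assuming rk already
   holds the 11 expanded round keys (the function and parameter names are
   illustrative, not part of this header):

   static inline Lib_IntVector_Intrinsics_vec128
   example_aes128_encrypt_block(Lib_IntVector_Intrinsics_vec128 block,
                                const Lib_IntVector_Intrinsics_vec128 rk[11])
   {
       block = Lib_IntVector_Intrinsics_vec128_xor(block, rk[0]);     // initial whitening
       for (int i = 1; i < 10; i++)
           block = Lib_IntVector_Intrinsics_ni_aes_enc(block, rk[i]); // rounds 1-9
       return Lib_IntVector_Intrinsics_ni_aes_enc_last(block, rk[10]); // final round
   }
*/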

#define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
    (_mm_xor_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq64(x0, x1) \
    (_mm_cmpeq_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq32(x0, x1) \
    (_mm_cmpeq_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt64(x0, x1) \
    (_mm_cmpgt_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt32(x0, x1) \
    (_mm_cmpgt_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_or(x0, x1) \
    (_mm_or_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_and(x0, x1) \
    (_mm_and_si128(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
    (_mm_xor_si128(x0, _mm_set1_epi32(-1)))

#define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \
    (_mm_slli_si128(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \
    (_mm_srli_si128(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \
    (_mm_slli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right64(x0, x1) \
    (_mm_srli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_left32(x0, x1) \
    (_mm_slli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right32(x0, x1) \
    (_mm_srli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_8(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_24(x0) \
    (_mm_shuffle_epi8(x0, _mm_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, x1) \
    (((x1) == 8 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_8(x0) : ((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) : ((x1) == 24 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_24(x0) : _mm_xor_si128(_mm_slli_epi32(x0, x1), _mm_srli_epi32(x0, 32 - (x1)))))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32(x0, x1) \
    (Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, 32 - (x1)))
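
/* The byte-aligned rotations (8, 16, 24) are special-cased as single pshufb
   shuffles because stream ciphers rotate by those amounts in their hot loop.
   A hedged sketch of a full ChaCha20 column quarter-round over four parallel
   state vectors, built only from the macros above (the function name is
   illustrative, not part of this header):

   static inline void
   example_chacha20_quarter_round(Lib_IntVector_Intrinsics_vec128 st[4])
   {
       st[0] = Lib_IntVector_Intrinsics_vec128_add32(st[0], st[1]);
       st[3] = Lib_IntVector_Intrinsics_vec128_rotate_left32(Lib_IntVector_Intrinsics_vec128_xor(st[3], st[0]), 16);
       st[2] = Lib_IntVector_Intrinsics_vec128_add32(st[2], st[3]);
       st[1] = Lib_IntVector_Intrinsics_vec128_rotate_left32(Lib_IntVector_Intrinsics_vec128_xor(st[1], st[2]), 12);
       st[0] = Lib_IntVector_Intrinsics_vec128_add32(st[0], st[1]);
       st[3] = Lib_IntVector_Intrinsics_vec128_rotate_left32(Lib_IntVector_Intrinsics_vec128_xor(st[3], st[0]), 8);
       st[2] = Lib_IntVector_Intrinsics_vec128_add32(st[2], st[3]);
       st[1] = Lib_IntVector_Intrinsics_vec128_rotate_left32(Lib_IntVector_Intrinsics_vec128_xor(st[1], st[2]), 7);
   }
*/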

#define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2 * x1 + 1, 2 * x1, 2 * x2 + 1, 2 * x2)))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes32(x0, x1) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE((x1 + 3) % 4, (x1 + 2) % 4, (x1 + 1) % 4, x1 % 4)))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \
    (_mm_shuffle_epi32(x0, _MM_SHUFFLE((2 * x1 + 3) % 4, (2 * x1 + 2) % 4, (2 * x1 + 1) % 4, (2 * x1) % 4)))

#define Lib_IntVector_Intrinsics_vec128_load_le(x0) \
    (_mm_loadu_si128((__m128i*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_store_le(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), x1))

#define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)))

#define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)))

#define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \
    (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)))

#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))))

#define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3))))

#define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \
    (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))))
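
/* The *_be variants byte-swap on the fly: load_be/store_be reverse all 16
   bytes, while the 32-bit and 64-bit forms reverse bytes within each 32-bit
   or 64-bit lane, which is what big-endian word-oriented primitives (e.g.
   the SHA-2 message schedule) need. */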

#define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \
    (_mm_insert_epi8(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_insert32(x0, x1, x2) \
    (_mm_insert_epi32(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_insert64(x0, x1, x2) \
    (_mm_insert_epi64(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec128_extract8(x0, x1) \
    (_mm_extract_epi8(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract32(x0, x1) \
    (_mm_extract_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract64(x0, x1) \
    (_mm_extract_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_zero \
    (_mm_setzero_si128())

#define Lib_IntVector_Intrinsics_vec128_add64(x0, x1) \
    (_mm_add_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub64(x0, x1) \
    (_mm_sub_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul64(x0, x1) \
    (_mm_mul_epu32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_smul64(x0, x1) \
    (_mm_mul_epu32(x0, _mm_set1_epi64x(x1)))
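
/* Note that vec128_mul64 and vec128_smul64 are 32x32->64-bit multiplies:
   _mm_mul_epu32 multiplies only the low 32 bits of each 64-bit lane and
   returns the full 64-bit products. Callers such as vectorized Poly1305 keep
   their limbs below 2^32 precisely so this is enough; these macros are not a
   general 64x64-bit vector multiply. */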

#define Lib_IntVector_Intrinsics_vec128_add32(x0, x1) \
    (_mm_add_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub32(x0, x1) \
    (_mm_sub_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul32(x0, x1) \
    (_mm_mullo_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_smul32(x0, x1) \
    (_mm_mullo_epi32(x0, _mm_set1_epi32(x1)))

#define Lib_IntVector_Intrinsics_vec128_load128(x) \
    ((__m128i)x)

#define Lib_IntVector_Intrinsics_vec128_load64(x) \
    (_mm_set1_epi64x(x)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load64s(x0, x1) \
    (_mm_set_epi64x(x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load32(x) \
    (_mm_set1_epi32(x))

#define Lib_IntVector_Intrinsics_vec128_load32s(x0, x1, x2, x3) \
    (_mm_set_epi32(x3, x2, x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_interleave_low32(x1, x2) \
    (_mm_unpacklo_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high32(x1, x2) \
    (_mm_unpackhi_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_low64(x1, x2) \
    (_mm_unpacklo_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high64(x1, x2) \
    (_mm_unpackhi_epi64(x1, x2))
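
/* The interleave macros combine into the classic 4x4 transpose of 32-bit
   lanes, e.g. for turning four row vectors into four column vectors. A
   hedged sketch (illustrative only; the function name is not part of this
   header):

   static inline void
   example_transpose4x4(Lib_IntVector_Intrinsics_vec128 r[4])
   {
       Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_interleave_low32(r[0], r[1]);
       Lib_IntVector_Intrinsics_vec128 v1 = Lib_IntVector_Intrinsics_vec128_interleave_high32(r[0], r[1]);
       Lib_IntVector_Intrinsics_vec128 v2 = Lib_IntVector_Intrinsics_vec128_interleave_low32(r[2], r[3]);
       Lib_IntVector_Intrinsics_vec128 v3 = Lib_IntVector_Intrinsics_vec128_interleave_high32(r[2], r[3]);
       r[0] = Lib_IntVector_Intrinsics_vec128_interleave_low64(v0, v2);
       r[1] = Lib_IntVector_Intrinsics_vec128_interleave_high64(v0, v2);
       r[2] = Lib_IntVector_Intrinsics_vec128_interleave_low64(v1, v3);
       r[3] = Lib_IntVector_Intrinsics_vec128_interleave_high64(v1, v3);
   }
*/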

// The following functions are only available on machines that support Intel AVX2

#include <immintrin.h>
#include <wmmintrin.h>

typedef __m256i Lib_IntVector_Intrinsics_vec256;

#define Lib_IntVector_Intrinsics_vec256_eq64(x0, x1) \
    (_mm256_cmpeq_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_eq32(x0, x1) \
    (_mm256_cmpeq_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_gt64(x0, x1) \
    (_mm256_cmpgt_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_gt32(x0, x1) \
    (_mm256_cmpgt_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_xor(x0, x1) \
    (_mm256_xor_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_or(x0, x1) \
    (_mm256_or_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_and(x0, x1) \
    (_mm256_and_si256(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_lognot(x0) \
    (_mm256_xor_si256(x0, _mm256_set1_epi32(-1)))

#define Lib_IntVector_Intrinsics_vec256_shift_left(x0, x1) \
    (_mm256_slli_si256(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec256_shift_right(x0, x1) \
    (_mm256_srli_si256(x0, (x1) / 8))

#define Lib_IntVector_Intrinsics_vec256_shift_left64(x0, x1) \
    (_mm256_slli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_right64(x0, x1) \
    (_mm256_srli_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_left32(x0, x1) \
    (_mm256_slli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_shift_right32(x0, x1) \
    (_mm256_srli_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_8(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3, 14, 13, 12, 15, 10, 9, 8, 11, 6, 5, 4, 7, 2, 1, 0, 3)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_16(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2, 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32_24(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1, 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_left32(x0, x1) \
    (((x1) == 8 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_8(x0) : ((x1) == 16 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_16(x0) : ((x1) == 24 ? Lib_IntVector_Intrinsics_vec256_rotate_left32_24(x0) : _mm256_or_si256(_mm256_slli_epi32(x0, x1), _mm256_srli_epi32(x0, 32 - (x1)))))))

#define Lib_IntVector_Intrinsics_vec256_rotate_right32(x0, x1) \
    (Lib_IntVector_Intrinsics_vec256_rotate_left32(x0, 32 - (x1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_8(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1, 8, 15, 14, 13, 12, 11, 10, 9, 0, 7, 6, 5, 4, 3, 2, 1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_16(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(9, 8, 15, 14, 13, 12, 11, 10, 1, 0, 7, 6, 5, 4, 3, 2, 9, 8, 15, 14, 13, 12, 11, 10, 1, 0, 7, 6, 5, 4, 3, 2)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_24(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(10, 9, 8, 15, 14, 13, 12, 11, 2, 1, 0, 7, 6, 5, 4, 3, 10, 9, 8, 15, 14, 13, 12, 11, 2, 1, 0, 7, 6, 5, 4, 3)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_32(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12, 3, 2, 1, 0, 7, 6, 5, 4)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_40(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(12, 11, 10, 9, 8, 15, 14, 13, 4, 3, 2, 1, 0, 7, 6, 5, 12, 11, 10, 9, 8, 15, 14, 13, 4, 3, 2, 1, 0, 7, 6, 5)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_48(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(13, 12, 11, 10, 9, 8, 15, 14, 5, 4, 3, 2, 1, 0, 7, 6, 13, 12, 11, 10, 9, 8, 15, 14, 5, 4, 3, 2, 1, 0, 7, 6)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64_56(x0) \
    (_mm256_shuffle_epi8(x0, _mm256_set_epi8(14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7, 14, 13, 12, 11, 10, 9, 8, 15, 6, 5, 4, 3, 2, 1, 0, 7)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right64(x0, x1) \
    (((x1) == 8 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_8(x0) : ((x1) == 16 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_16(x0) : ((x1) == 24 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_24(x0) : ((x1) == 32 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_32(x0) : ((x1) == 40 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_40(x0) : ((x1) == 48 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_48(x0) : ((x1) == 56 ? Lib_IntVector_Intrinsics_vec256_rotate_right64_56(x0) : _mm256_xor_si256(_mm256_srli_epi64((x0), (x1)), _mm256_slli_epi64((x0), (64 - (x1))))))))))))

#define Lib_IntVector_Intrinsics_vec256_rotate_left64(x0, x1) \
    (Lib_IntVector_Intrinsics_vec256_rotate_right64(x0, 64 - (x1)))
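
/* The fixed rotate_right64_{8,16,24,32,40,48,56} variants cover the
   byte-aligned rotation amounts, each compiling to a single byte shuffle
   instead of two shifts and an OR. BLAKE2b, for instance, rotates by 32, 24,
   16 and 63, so all but the last hit the shuffle fast path. */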

#define Lib_IntVector_Intrinsics_vec256_shuffle64(x0, x1, x2, x3, x4) \
    (_mm256_permute4x64_epi64(x0, _MM_SHUFFLE(x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec256_shuffle32(x0, x1, x2, x3, x4, x5, x6, x7, x8) \
    (_mm256_permutevar8x32_epi32(x0, _mm256_set_epi32(x8, x7, x6, x5, x4, x3, x2, x1)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right_lanes32(x0, x1) \
    (_mm256_permutevar8x32_epi32(x0, _mm256_set_epi32((x1 + 7) % 8, (x1 + 6) % 8, (x1 + 5) % 8, (x1 + 4) % 8, (x1 + 3) % 8, (x1 + 2) % 8, (x1 + 1) % 8, x1 % 8)))

#define Lib_IntVector_Intrinsics_vec256_rotate_right_lanes64(x0, x1) \
    (_mm256_permute4x64_epi64(x0, _MM_SHUFFLE((x1 + 3) % 4, (x1 + 2) % 4, (x1 + 1) % 4, x1 % 4)))

#define Lib_IntVector_Intrinsics_vec256_load_le(x0) \
    (_mm256_loadu_si256((__m256i*)(x0)))

#define Lib_IntVector_Intrinsics_vec256_load32_be(x0) \
    (_mm256_shuffle_epi8(_mm256_loadu_si256((__m256i*)(x0)), _mm256_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)))

#define Lib_IntVector_Intrinsics_vec256_load64_be(x0) \
    (_mm256_shuffle_epi8(_mm256_loadu_si256((__m256i*)(x0)), _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7)))

#define Lib_IntVector_Intrinsics_vec256_store_le(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), x1))

#define Lib_IntVector_Intrinsics_vec256_store32_be(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), _mm256_shuffle_epi8(x1, _mm256_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3))))

#define Lib_IntVector_Intrinsics_vec256_store64_be(x0, x1) \
    (_mm256_storeu_si256((__m256i*)(x0), _mm256_shuffle_epi8(x1, _mm256_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7))))

#define Lib_IntVector_Intrinsics_vec256_insert8(x0, x1, x2) \
    (_mm256_insert_epi8(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_insert32(x0, x1, x2) \
    (_mm256_insert_epi32(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_insert64(x0, x1, x2) \
    (_mm256_insert_epi64(x0, x1, x2))

#define Lib_IntVector_Intrinsics_vec256_extract8(x0, x1) \
    (_mm256_extract_epi8(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_extract32(x0, x1) \
    (_mm256_extract_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_extract64(x0, x1) \
    (_mm256_extract_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_zero \
    (_mm256_setzero_si256())

#define Lib_IntVector_Intrinsics_vec256_add64(x0, x1) \
    (_mm256_add_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_sub64(x0, x1) \
    (_mm256_sub_epi64(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_mul64(x0, x1) \
    (_mm256_mul_epu32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_smul64(x0, x1) \
    (_mm256_mul_epu32(x0, _mm256_set1_epi64x(x1)))

#define Lib_IntVector_Intrinsics_vec256_add32(x0, x1) \
    (_mm256_add_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_sub32(x0, x1) \
    (_mm256_sub_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_mul32(x0, x1) \
    (_mm256_mullo_epi32(x0, x1))

#define Lib_IntVector_Intrinsics_vec256_smul32(x0, x1) \
    (_mm256_mullo_epi32(x0, _mm256_set1_epi32(x1)))

#define Lib_IntVector_Intrinsics_vec256_load64(x1) \
    (_mm256_set1_epi64x(x1)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec256_load64s(x0, x1, x2, x3) \
    (_mm256_set_epi64x(x3, x2, x1, x0)) /* hi lo */

#define Lib_IntVector_Intrinsics_vec256_load32(x) \
    (_mm256_set1_epi32(x))

#define Lib_IntVector_Intrinsics_vec256_load32s(x0, x1, x2, x3, x4, x5, x6, x7) \
    (_mm256_set_epi32(x7, x6, x5, x4, x3, x2, x1, x0)) /* hi lo */
#define Lib_IntVector_Intrinsics_vec256_load128(x) \
    (_mm256_set_m128i((__m128i)(x), (__m128i)(x))) /* broadcast into both 128-bit lanes; _mm256_set_m128i takes (hi, lo) */

#define Lib_IntVector_Intrinsics_vec256_load128s(x0, x1) \
    (_mm256_set_m128i((__m128i)x1, (__m128i)x0))

#define Lib_IntVector_Intrinsics_vec256_interleave_low32(x1, x2) \
    (_mm256_unpacklo_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_high32(x1, x2) \
    (_mm256_unpackhi_epi32(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_low64(x1, x2) \
    (_mm256_unpacklo_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_high64(x1, x2) \
    (_mm256_unpackhi_epi64(x1, x2))

#define Lib_IntVector_Intrinsics_vec256_interleave_low128(x1, x2) \
    (_mm256_permute2x128_si256(x1, x2, 0x20))

#define Lib_IntVector_Intrinsics_vec256_interleave_high128(x1, x2) \
    (_mm256_permute2x128_si256(x1, x2, 0x31))
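
/* A minimal usage sketch for the 256-bit API (illustrative only; the
   function and parameter names are not part of this header): lane-wise
   addition of four 64-bit values.

   static inline void
   example_add4x64(uint64_t out[4], uint64_t a[4], uint64_t b[4])
   {
       Lib_IntVector_Intrinsics_vec256 va = Lib_IntVector_Intrinsics_vec256_load_le(a);
       Lib_IntVector_Intrinsics_vec256 vb = Lib_IntVector_Intrinsics_vec256_load_le(b);
       Lib_IntVector_Intrinsics_vec256_store_le(out, Lib_IntVector_Intrinsics_vec256_add64(va, vb));
   }
*/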

#elif defined(__aarch64__) || defined(_M_ARM64) || defined(__arm__) || defined(_M_ARM)
#include <arm_neon.h>

typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;

#define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
    (veorq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_eq64(x0, x1) \
    (vreinterpretq_u32_u64(vceqq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x1)))) /* compare whole 64-bit lanes, not 32-bit halves */

#define Lib_IntVector_Intrinsics_vec128_eq32(x0, x1) \
    (vceqq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_gt32(x0, x1) \
    (vcgtq_u32(x0, x1))

/* Internal helpers: narrow each 64-bit lane to its high (resp. low) 32 bits. */
#define high32(x0) \
    (vmovn_u64(vshrq_n_u64(vreinterpretq_u64_u32(x0), 32)))

#define low32(x0) \
    (vmovn_u64(vreinterpretq_u64_u32(x0)))
#define Lib_IntVector_Intrinsics_vec128_gt64(x0, x1) \
    (vreinterpretq_u32_s64(vmovl_s32(vreinterpret_s32_u32(vorr_u32(vcgt_u32(high32(x0), high32(x1)), vand_u32(vceq_u32(high32(x0), high32(x1)), vcgt_u32(low32(x0), low32(x1)))))))) /* sign-extend the per-lane 32-bit mask to a full 64-bit mask */

#define Lib_IntVector_Intrinsics_vec128_or(x0, x1) \
    (vorrq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_and(x0, x1) \
    (vandq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
    (vmvnq_u32(x0))

#define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \
    (vreinterpretq_u32_u8(vextq_u8(vdupq_n_u8(0), vreinterpretq_u8_u32(x0), 16 - (x1) / 8)))

#define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \
    (vreinterpretq_u32_u8(vextq_u8(vreinterpretq_u8_u32(x0), vdupq_n_u8(0), (x1) / 8)))

#define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \
    (vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_shift_right64(x0, x1) \
    (vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(x0), x1)))

#define Lib_IntVector_Intrinsics_vec128_shift_left32(x0, x1) \
    (vshlq_n_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_shift_right32(x0, x1) \
    (vshrq_n_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x1) \
    (vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_left32(x0, x1) \
    (((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_left32_16(x0) : vsriq_n_u32(vshlq_n_u32((x0), (x1)), (x0), 32 - (x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32_16(x1) \
    (vreinterpretq_u32_u16(vrev32q_u16(vreinterpretq_u16_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right32(x0, x1) \
    (((x1) == 16 ? Lib_IntVector_Intrinsics_vec128_rotate_right32_16(x0) : vsriq_n_u32(vshlq_n_u32((x0), 32 - (x1)), (x0), (x1))))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes32(x0, x1) \
    (vextq_u32(x0, x0, x1))

#define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \
    (vreinterpretq_u32_u64(vextq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x0), x1)))

/*
#define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \
  (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x1,x2,x3,x4)))

#define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \
  (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2*x1+1,2*x1,2*x2+1,2*x2)))
*/

#define Lib_IntVector_Intrinsics_vec128_load_le(x0) \
    (vld1q_u32((const uint32_t*)(x0)))

#define Lib_IntVector_Intrinsics_vec128_store_le(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (x1)))

/*
#define Lib_IntVector_Intrinsics_vec128_load_be(x0)		\
  (     Lib_IntVector_Intrinsics_vec128 l = vrev64q_u8(vld1q_u32((uint32_t*)(x0)));

*/

#define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \
    (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))

#define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \
    (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))

/*
#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1)	\
  (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))))
*/

#define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x1))))))

#define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \
    (vst1q_u32((uint32_t*)(x0), (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(x1))))))

#define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \
    (vreinterpretq_u32_u8(vsetq_lane_u8((uint8_t)(x1), vreinterpretq_u8_u32(x0), x2)))

#define Lib_IntVector_Intrinsics_vec128_insert32(x0, x1, x2) \
    (vsetq_lane_u32(x1, x0, x2))

#define Lib_IntVector_Intrinsics_vec128_insert64(x0, x1, x2) \
    (vreinterpretq_u32_u64(vsetq_lane_u64(x1, vreinterpretq_u64_u32(x0), x2)))

#define Lib_IntVector_Intrinsics_vec128_extract8(x0, x1) \
    (vgetq_lane_u8(vreinterpretq_u8_u32(x0), x1))

#define Lib_IntVector_Intrinsics_vec128_extract32(x0, x1) \
    (vgetq_lane_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_extract64(x0, x1) \
    (vgetq_lane_u64(vreinterpretq_u64_u32(x0), x1))

#define Lib_IntVector_Intrinsics_vec128_zero \
    (vdupq_n_u32(0))

#define Lib_IntVector_Intrinsics_vec128_add64(x0, x1) \
    (vreinterpretq_u32_u64(vaddq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_sub64(x0, x1) \
    (vreinterpretq_u32_u64(vsubq_u64(vreinterpretq_u64_u32(x0), vreinterpretq_u64_u32(x1))))

#define Lib_IntVector_Intrinsics_vec128_mul64(x0, x1) \
    (vreinterpretq_u32_u64(vmull_u32(vmovn_u64(vreinterpretq_u64_u32(x0)), vmovn_u64(vreinterpretq_u64_u32(x1)))))

#define Lib_IntVector_Intrinsics_vec128_smul64(x0, x1) \
    (vreinterpretq_u32_u64(vmull_n_u32(vmovn_u64(vreinterpretq_u64_u32(x0)), (uint32_t)x1)))

#define Lib_IntVector_Intrinsics_vec128_add32(x0, x1) \
    (vaddq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_sub32(x0, x1) \
    (vsubq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_mul32(x0, x1) \
    (vmulq_u32(x0, x1))

#define Lib_IntVector_Intrinsics_vec128_smul32(x0, x1) \
    (vmulq_n_u32(x0, (uint32_t)(x1)))

#define Lib_IntVector_Intrinsics_vec128_load128(x) \
    ((uint32x4_t)(x))

#define Lib_IntVector_Intrinsics_vec128_load64(x) \
    (vreinterpretq_u32_u64(vdupq_n_u64(x))) /* hi lo */

#define Lib_IntVector_Intrinsics_vec128_load32(x) \
    (vdupq_n_u32(x)) /* hi lo */

static inline Lib_IntVector_Intrinsics_vec128
Lib_IntVector_Intrinsics_vec128_load64s(uint64_t x1, uint64_t x2)
{
    const uint64_t a[2] = { x1, x2 };
    return vreinterpretq_u32_u64(vld1q_u64(a));
}

static inline Lib_IntVector_Intrinsics_vec128
Lib_IntVector_Intrinsics_vec128_load32s(uint32_t x1, uint32_t x2, uint32_t x3, uint32_t x4)
{
    const uint32_t a[4] = { x1, x2, x3, x4 };
    return vld1q_u32(a);
}
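
/* The NEON definitions mirror the x86 API above, so callers can be written
   once against the Lib_IntVector_Intrinsics_* names. A hedged sketch
   (illustrative only; the function name is not part of this header):

   static inline Lib_IntVector_Intrinsics_vec128
   example_bump_counters(void)
   {
       Lib_IntVector_Intrinsics_vec128 ctr = Lib_IntVector_Intrinsics_vec128_load32s(0, 1, 2, 3);
       // advance four parallel 32-bit counters by 4 in one vector add
       return Lib_IntVector_Intrinsics_vec128_add32(ctr, Lib_IntVector_Intrinsics_vec128_load32(4));
   }
*/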

#define Lib_IntVector_Intrinsics_vec128_interleave_low32(x1, x2) \
    (vzip1q_u32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_high32(x1, x2) \
    (vzip2q_u32(x1, x2))

#define Lib_IntVector_Intrinsics_vec128_interleave_low64(x1, x2) \
    (vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(x1), vreinterpretq_u64_u32(x2))))

#define Lib_IntVector_Intrinsics_vec128_interleave_high64(x1, x2) \
    (vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(x1), vreinterpretq_u64_u32(x2))))

#endif
#endif