/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#pragma once
#ifndef _INCLUDED_EMM
#define _INCLUDED_EMM

#include <vcruntime.h>
#include <xmmintrin.h>

#if defined(_MSC_VER) && !defined(__clang__)

typedef union _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128i
{
    __int8  m128i_i8[16];
    __int16 m128i_i16[8];
    __int32 m128i_i32[4];
    __int64 m128i_i64[2];
    unsigned __int8  m128i_u8[16];
    unsigned __int16 m128i_u16[8];
    unsigned __int32 m128i_u32[4];
    unsigned __int64 m128i_u64[2];
} __m128i;
_STATIC_ASSERT(sizeof(__m128i) == 16);

typedef struct _DECLSPEC_INTRIN_TYPE _CRT_ALIGN(16) __m128d
{
    double m128d_f64[2];
} __m128d;

typedef __declspec(align(1)) __m128i __m128i_u;
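
/* Usage sketch (illustrative, not part of the upstream header): under MSVC
 * the union members give per-lane access to a vector value, e.g.
 *
 *     __m128i v = _mm_set1_epi32(7);
 *     int lane0 = v.m128i_i32[0];   // reads the low 32-bit lane (7)
 */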

#define __ATTRIBUTE_SSE2__

#else /* _MSC_VER */

typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));

typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1)));

/* Type defines.  */
typedef double __v2df __attribute__((__vector_size__(16)));
typedef long long __v2di __attribute__((__vector_size__(16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Unsigned types */
typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
typedef unsigned char __v16qu __attribute__((__vector_size__(16)));

/* We need an explicitly signed variant for char. Note that this shouldn't
 * appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));

#ifdef __clang__
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2"),__min_vector_width__(128)))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2"),__min_vector_width__(128)))
#else
#define __ATTRIBUTE_SSE2__ __attribute__((__target__("sse2")))
#define __ATTRIBUTE_MMXSSE2__ __attribute__((__target__("mmx,sse2")))
#endif
#define __INTRIN_INLINE_SSE2 __INTRIN_INLINE __ATTRIBUTE_SSE2__
#define __INTRIN_INLINE_MMXSSE2 __INTRIN_INLINE __ATTRIBUTE_MMXSSE2__

#endif /* _MSC_VER */

#ifdef __cplusplus
extern "C" {
#endif


extern __m128d _mm_add_sd(__m128d a, __m128d b);
extern __m128d _mm_add_pd(__m128d a, __m128d b);
extern __m128d _mm_sub_sd(__m128d a, __m128d b);
extern __m128d _mm_sub_pd(__m128d a, __m128d b);
extern __m128d _mm_mul_sd(__m128d a, __m128d b);
extern __m128d _mm_mul_pd(__m128d a, __m128d b);
extern __m128d _mm_div_sd(__m128d a, __m128d b);
extern __m128d _mm_div_pd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_sd(__m128d a, __m128d b);
extern __m128d _mm_sqrt_pd(__m128d a);
extern __m128d _mm_min_sd(__m128d a, __m128d b);
extern __m128d _mm_min_pd(__m128d a, __m128d b);
extern __m128d _mm_max_sd(__m128d a, __m128d b);
extern __m128d _mm_max_pd(__m128d a, __m128d b);
extern __m128d _mm_and_pd(__m128d a, __m128d b);
extern __m128d _mm_andnot_pd(__m128d a, __m128d b);
extern __m128d _mm_or_pd(__m128d a, __m128d b);
extern __m128d _mm_xor_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmple_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_pd(__m128d a, __m128d b);
extern __m128d _mm_cmpeq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmplt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmple_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpgt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpge_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpunord_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpneq_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnlt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnle_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpngt_sd(__m128d a, __m128d b);
extern __m128d _mm_cmpnge_sd(__m128d a, __m128d b);
extern int _mm_comieq_sd(__m128d a, __m128d b);
extern int _mm_comilt_sd(__m128d a, __m128d b);
extern int _mm_comile_sd(__m128d a, __m128d b);
extern int _mm_comigt_sd(__m128d a, __m128d b);
extern int _mm_comige_sd(__m128d a, __m128d b);
extern int _mm_comineq_sd(__m128d a, __m128d b);
extern int _mm_ucomieq_sd(__m128d a, __m128d b);
extern int _mm_ucomilt_sd(__m128d a, __m128d b);
extern int _mm_ucomile_sd(__m128d a, __m128d b);
extern int _mm_ucomigt_sd(__m128d a, __m128d b);
extern int _mm_ucomige_sd(__m128d a, __m128d b);
extern int _mm_ucomineq_sd(__m128d a, __m128d b);
extern __m128 _mm_cvtpd_ps(__m128d a);
extern __m128d _mm_cvtps_pd(__m128 a);
extern __m128d _mm_cvtepi32_pd(__m128i a);
extern __m128i _mm_cvtpd_epi32(__m128d a);
extern int _mm_cvtsd_si32(__m128d a);
extern __m128 _mm_cvtsd_ss(__m128 a, __m128d b);
extern __m128d _mm_cvtsi32_sd(__m128d a, int b);
extern __m128d _mm_cvtss_sd(__m128d a, __m128 b);
extern __m128i _mm_cvttpd_epi32(__m128d a);
extern int _mm_cvttsd_si32(__m128d a);
extern __m64 _mm_cvtpd_pi32(__m128d a);
extern __m64 _mm_cvttpd_pi32(__m128d a);
extern __m128d _mm_cvtpi32_pd(__m64 a);
extern double _mm_cvtsd_f64(__m128d a);
extern __m128d _mm_load_pd(double const *dp);
extern __m128d _mm_load1_pd(double const *dp);
extern __m128d _mm_loadr_pd(double const *dp);
extern __m128d _mm_loadu_pd(double const *dp);
//extern __m128i _mm_loadu_si64(void const *a);
//extern __m128i _mm_loadu_si32(void const *a);
//extern __m128i _mm_loadu_si16(void const *a);
extern __m128d _mm_load_sd(double const *dp);
extern __m128d _mm_loadh_pd(__m128d a, double const *dp);
extern __m128d _mm_loadl_pd(__m128d a, double const *dp);
//extern __m128d _mm_undefined_pd(void);
extern __m128d _mm_set_sd(double w);
extern __m128d _mm_set1_pd(double w);
extern __m128d _mm_set_pd(double w, double x);
extern __m128d _mm_setr_pd(double w, double x);
extern __m128d _mm_setzero_pd(void);
extern __m128d _mm_move_sd(__m128d a, __m128d b);
extern void _mm_store_sd(double *dp, __m128d a);
extern void _mm_store_pd(double *dp, __m128d a);
extern void _mm_store1_pd(double *dp, __m128d a);
extern void _mm_storeu_pd(double *dp, __m128d a);
extern void _mm_storer_pd(double *dp, __m128d a);
extern void _mm_storeh_pd(double *dp, __m128d a);
extern void _mm_storel_pd(double *dp, __m128d a);
extern __m128i _mm_add_epi8(__m128i a, __m128i b);
extern __m128i _mm_add_epi16(__m128i a, __m128i b);
extern __m128i _mm_add_epi32(__m128i a, __m128i b);
extern __m64 _mm_add_si64(__m64 a, __m64 b);
extern __m128i _mm_add_epi64(__m128i a, __m128i b);
extern __m128i _mm_adds_epi8(__m128i a, __m128i b);
extern __m128i _mm_adds_epi16(__m128i a, __m128i b);
extern __m128i _mm_adds_epu8(__m128i a, __m128i b);
extern __m128i _mm_adds_epu16(__m128i a, __m128i b);
extern __m128i _mm_avg_epu8(__m128i a, __m128i b);
extern __m128i _mm_avg_epu16(__m128i a, __m128i b);
extern __m128i _mm_madd_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epi16(__m128i a, __m128i b);
extern __m128i _mm_max_epu8(__m128i a, __m128i b);
extern __m128i _mm_min_epi16(__m128i a, __m128i b);
extern __m128i _mm_min_epu8(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_mulhi_epu16(__m128i a, __m128i b);
extern __m128i _mm_mullo_epi16(__m128i a, __m128i b);
extern __m64 _mm_mul_su32(__m64 a, __m64 b);
extern __m128i _mm_mul_epu32(__m128i a, __m128i b);
extern __m128i _mm_sad_epu8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi8(__m128i a, __m128i b);
extern __m128i _mm_sub_epi16(__m128i a, __m128i b);
extern __m128i _mm_sub_epi32(__m128i a, __m128i b);
extern __m64 _mm_sub_si64(__m64 a, __m64 b);
extern __m128i _mm_sub_epi64(__m128i a, __m128i b);
extern __m128i _mm_subs_epi8(__m128i a, __m128i b);
extern __m128i _mm_subs_epi16(__m128i a, __m128i b);
extern __m128i _mm_subs_epu8(__m128i a, __m128i b);
extern __m128i _mm_subs_epu16(__m128i a, __m128i b);
extern __m128i _mm_and_si128(__m128i a, __m128i b);
extern __m128i _mm_andnot_si128(__m128i a, __m128i b);
extern __m128i _mm_or_si128(__m128i a, __m128i b);
extern __m128i _mm_xor_si128(__m128i a, __m128i b);
extern __m128i _mm_slli_si128(__m128i a, int i);
extern __m128i _mm_slli_epi16(__m128i a, int count);
extern __m128i _mm_sll_epi16(__m128i a, __m128i count);
extern __m128i _mm_slli_epi32(__m128i a, int count);
extern __m128i _mm_sll_epi32(__m128i a, __m128i count);
extern __m128i _mm_slli_epi64(__m128i a, int count);
extern __m128i _mm_sll_epi64(__m128i a, __m128i count);
extern __m128i _mm_srai_epi16(__m128i a, int count);
extern __m128i _mm_sra_epi16(__m128i a, __m128i count);
extern __m128i _mm_srai_epi32(__m128i a, int count);
extern __m128i _mm_sra_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_si128(__m128i a, int imm);
extern __m128i _mm_srli_epi16(__m128i a, int count);
extern __m128i _mm_srl_epi16(__m128i a, __m128i count);
extern __m128i _mm_srli_epi32(__m128i a, int count);
extern __m128i _mm_srl_epi32(__m128i a, __m128i count);
extern __m128i _mm_srli_epi64(__m128i a, int count);
extern __m128i _mm_srl_epi64(__m128i a, __m128i count);
extern __m128i _mm_cmpeq_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpeq_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmpgt_epi32(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi8(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi16(__m128i a, __m128i b);
extern __m128i _mm_cmplt_epi32(__m128i a, __m128i b);
#ifdef _M_AMD64
extern __m128d _mm_cvtsi64_sd(__m128d a, long long b);
extern long long _mm_cvtsd_si64(__m128d a);
extern long long _mm_cvttsd_si64(__m128d a);
#endif
extern __m128 _mm_cvtepi32_ps(__m128i a);
extern __m128i _mm_cvtps_epi32(__m128 a);
extern __m128i _mm_cvttps_epi32(__m128 a);
extern __m128i _mm_cvtsi32_si128(int a);
#ifdef _M_AMD64
extern __m128i _mm_cvtsi64_si128(long long a);
#endif
extern int _mm_cvtsi128_si32(__m128i a);
#ifdef _M_AMD64
extern long long _mm_cvtsi128_si64(__m128i a);
#endif
extern __m128i _mm_load_si128(__m128i const *p);
extern __m128i _mm_loadu_si128(__m128i_u const *p);
extern __m128i _mm_loadl_epi64(__m128i_u const *p);
//extern __m128i _mm_undefined_si128(void);
//extern __m128i _mm_set_epi64x(long long q1, long long q0); // FIXME
extern __m128i _mm_set_epi64(__m64 q1, __m64 q0);
//extern __m128i _mm_set_epi32(int i3, int i1, int i0);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);
//extern __m128i _mm_set_epi16(short w7, short w2, short w1, short w0);
extern __m128i _mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2, short w1, short w0);
//extern __m128i _mm_set_epi8(char b15, char b10, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
//extern __m128i _mm_set1_epi64x(long long q); // FIXME
extern __m128i _mm_set1_epi64(__m64 q);
extern __m128i _mm_set1_epi32(int i);
extern __m128i _mm_set1_epi16(short w);
extern __m128i _mm_set1_epi8(char b);
extern __m128i _mm_setl_epi64(__m128i q); // FIXME: clang?
extern __m128i _mm_setr_epi64(__m64 q0, __m64 q1);
//extern __m128i _mm_setr_epi32(int i0, int i2, int i3);
extern __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3);
//extern __m128i _mm_setr_epi16(short w0, short w5, short w6, short w7);
extern __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7);
//extern __m128i _mm_setr_epi8(char b0, char b6, char b11, char b12, char b13, char b14, char b15);
extern __m128i _mm_setr_epi8(char b15, char b14, char b13, char b12, char b11, char b10, char b9, char b8, char b7, char b6, char b5, char b4, char b3, char b2, char b1, char b0);
extern __m128i _mm_setzero_si128(void);
extern void _mm_store_si128(__m128i *p, __m128i b);
extern void _mm_storeu_si128(__m128i_u *p, __m128i b);
//extern void _mm_storeu_si64(void *p, __m128i b);
//extern void _mm_storeu_si32(void *p, __m128i b);
//extern void _mm_storeu_si16(void *p, __m128i b);
extern void _mm_maskmoveu_si128(__m128i d, __m128i n, _Out_writes_bytes_(16) char *p);
extern void _mm_storel_epi64(__m128i_u *p, __m128i a);
extern void _mm_stream_pd(double *p, __m128d a);
extern void _mm_stream_si128(__m128i *p, __m128i a);
extern void _mm_stream_si32(int *p, int a);
extern void _mm_clflush(void const *p);
extern void _mm_lfence(void);
extern void _mm_mfence(void);
extern __m128i _mm_packs_epi16(__m128i a, __m128i b);
extern __m128i _mm_packs_epi32(__m128i a, __m128i b);
extern __m128i _mm_packus_epi16(__m128i a, __m128i b);
extern int _mm_extract_epi16(__m128i a, int imm);
extern __m128i _mm_insert_epi16(__m128i a, int b, int imm);
extern int _mm_movemask_epi8(__m128i a);
extern __m128i _mm_shuffle_epi32(__m128i a, int imm);
extern __m128i _mm_shufflelo_epi16(__m128i a, int imm);
extern __m128i _mm_shufflehi_epi16(__m128i a, int imm);
extern __m128i _mm_unpackhi_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpackhi_epi64(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi8(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi16(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi32(__m128i a, __m128i b);
extern __m128i _mm_unpacklo_epi64(__m128i a, __m128i b);
extern __m64 _mm_movepi64_pi64(__m128i a);
extern __m128i _mm_movpi64_epi64(__m64 a);
extern __m128i _mm_move_epi64(__m128i a);
extern __m128d _mm_unpackhi_pd(__m128d a, __m128d b);
extern __m128d _mm_unpacklo_pd(__m128d a, __m128d b);
extern int _mm_movemask_pd(__m128d a);
extern __m128d _mm_shuffle_pd(__m128d a, __m128d b, int imm);
extern __m128 _mm_castpd_ps(__m128d a);
extern __m128i _mm_castpd_si128(__m128d a);
extern __m128d _mm_castps_pd(__m128 a);
extern __m128i _mm_castps_si128(__m128 a);
extern __m128 _mm_castsi128_ps(__m128i a);
extern __m128d _mm_castsi128_pd(__m128i a);
void _mm_pause(void);

/* Alternate names */
#define _mm_set_pd1(a) _mm_set1_pd(a)
#define _mm_load_pd1(p) _mm_load1_pd(p)
#define _mm_store_pd1(p, a) _mm_store1_pd((p), (a))
#define _mm_bslli_si128 _mm_slli_si128
#define _mm_bsrli_si128 _mm_srli_si128
#define _mm_stream_si64 _mm_stream_si64x

#if defined(_MSC_VER) && !defined(__clang__)

#pragma intrinsic(_mm_add_sd)
#pragma intrinsic(_mm_add_pd)
#pragma intrinsic(_mm_sub_sd)
#pragma intrinsic(_mm_sub_pd)
#pragma intrinsic(_mm_mul_sd)
#pragma intrinsic(_mm_mul_pd)
#pragma intrinsic(_mm_div_sd)
#pragma intrinsic(_mm_div_pd)
#pragma intrinsic(_mm_sqrt_sd)
#pragma intrinsic(_mm_sqrt_pd)
#pragma intrinsic(_mm_min_sd)
#pragma intrinsic(_mm_min_pd)
#pragma intrinsic(_mm_max_sd)
#pragma intrinsic(_mm_max_pd)
#pragma intrinsic(_mm_and_pd)
#pragma intrinsic(_mm_andnot_pd)
#pragma intrinsic(_mm_or_pd)
#pragma intrinsic(_mm_xor_pd)
#pragma intrinsic(_mm_cmpeq_pd)
#pragma intrinsic(_mm_cmplt_pd)
#pragma intrinsic(_mm_cmple_pd)
#pragma intrinsic(_mm_cmpgt_pd)
#pragma intrinsic(_mm_cmpge_pd)
#pragma intrinsic(_mm_cmpord_pd)
#pragma intrinsic(_mm_cmpunord_pd)
#pragma intrinsic(_mm_cmpneq_pd)
#pragma intrinsic(_mm_cmpnlt_pd)
#pragma intrinsic(_mm_cmpnle_pd)
#pragma intrinsic(_mm_cmpngt_pd)
#pragma intrinsic(_mm_cmpnge_pd)
#pragma intrinsic(_mm_cmpeq_sd)
#pragma intrinsic(_mm_cmplt_sd)
#pragma intrinsic(_mm_cmple_sd)
#pragma intrinsic(_mm_cmpgt_sd)
#pragma intrinsic(_mm_cmpge_sd)
#pragma intrinsic(_mm_cmpord_sd)
#pragma intrinsic(_mm_cmpunord_sd)
#pragma intrinsic(_mm_cmpneq_sd)
#pragma intrinsic(_mm_cmpnlt_sd)
#pragma intrinsic(_mm_cmpnle_sd)
#pragma intrinsic(_mm_cmpngt_sd)
#pragma intrinsic(_mm_cmpnge_sd)
#pragma intrinsic(_mm_comieq_sd)
#pragma intrinsic(_mm_comilt_sd)
#pragma intrinsic(_mm_comile_sd)
#pragma intrinsic(_mm_comigt_sd)
#pragma intrinsic(_mm_comige_sd)
#pragma intrinsic(_mm_comineq_sd)
#pragma intrinsic(_mm_ucomieq_sd)
#pragma intrinsic(_mm_ucomilt_sd)
#pragma intrinsic(_mm_ucomile_sd)
#pragma intrinsic(_mm_ucomigt_sd)
#pragma intrinsic(_mm_ucomige_sd)
#pragma intrinsic(_mm_ucomineq_sd)
#pragma intrinsic(_mm_cvtpd_ps)
#pragma intrinsic(_mm_cvtps_pd)
#pragma intrinsic(_mm_cvtepi32_pd)
#pragma intrinsic(_mm_cvtpd_epi32)
#pragma intrinsic(_mm_cvtsd_si32)
#pragma intrinsic(_mm_cvtsd_ss)
#pragma intrinsic(_mm_cvtsi32_sd)
#pragma intrinsic(_mm_cvtss_sd)
#pragma intrinsic(_mm_cvttpd_epi32)
#pragma intrinsic(_mm_cvttsd_si32)
//#pragma intrinsic(_mm_cvtpd_pi32)
//#pragma intrinsic(_mm_cvttpd_pi32)
//#pragma intrinsic(_mm_cvtpi32_pd)
#pragma intrinsic(_mm_cvtsd_f64)
#pragma intrinsic(_mm_load_pd)
#pragma intrinsic(_mm_load1_pd)
#pragma intrinsic(_mm_loadr_pd)
#pragma intrinsic(_mm_loadu_pd)
//#pragma intrinsic(_mm_loadu_si64)
//#pragma intrinsic(_mm_loadu_si32)
//#pragma intrinsic(_mm_loadu_si16)
#pragma intrinsic(_mm_load_sd)
#pragma intrinsic(_mm_loadh_pd)
#pragma intrinsic(_mm_loadl_pd)
//#pragma intrinsic(_mm_undefined_pd)
#pragma intrinsic(_mm_set_sd)
#pragma intrinsic(_mm_set1_pd)
#pragma intrinsic(_mm_set_pd)
#pragma intrinsic(_mm_setr_pd)
#pragma intrinsic(_mm_setzero_pd)
#pragma intrinsic(_mm_move_sd)
#pragma intrinsic(_mm_store_sd)
#pragma intrinsic(_mm_store_pd)
#pragma intrinsic(_mm_store1_pd)
#pragma intrinsic(_mm_storeu_pd)
#pragma intrinsic(_mm_storer_pd)
#pragma intrinsic(_mm_storeh_pd)
#pragma intrinsic(_mm_storel_pd)
#pragma intrinsic(_mm_add_epi8)
#pragma intrinsic(_mm_add_epi16)
#pragma intrinsic(_mm_add_epi32)
//#pragma intrinsic(_mm_add_si64)
#pragma intrinsic(_mm_add_epi64)
#pragma intrinsic(_mm_adds_epi8)
#pragma intrinsic(_mm_adds_epi16)
#pragma intrinsic(_mm_adds_epu8)
#pragma intrinsic(_mm_adds_epu16)
#pragma intrinsic(_mm_avg_epu8)
#pragma intrinsic(_mm_avg_epu16)
#pragma intrinsic(_mm_madd_epi16)
#pragma intrinsic(_mm_max_epi16)
#pragma intrinsic(_mm_max_epu8)
#pragma intrinsic(_mm_min_epi16)
#pragma intrinsic(_mm_min_epu8)
#pragma intrinsic(_mm_mulhi_epi16)
#pragma intrinsic(_mm_mulhi_epu16)
#pragma intrinsic(_mm_mullo_epi16)
//#pragma intrinsic(_mm_mul_su32)
#pragma intrinsic(_mm_mul_epu32)
#pragma intrinsic(_mm_sad_epu8)
#pragma intrinsic(_mm_sub_epi8)
#pragma intrinsic(_mm_sub_epi16)
#pragma intrinsic(_mm_sub_epi32)
//#pragma intrinsic(_mm_sub_si64)
#pragma intrinsic(_mm_sub_epi64)
#pragma intrinsic(_mm_subs_epi8)
#pragma intrinsic(_mm_subs_epi16)
#pragma intrinsic(_mm_subs_epu8)
#pragma intrinsic(_mm_subs_epu16)
#pragma intrinsic(_mm_and_si128)
#pragma intrinsic(_mm_andnot_si128)
#pragma intrinsic(_mm_or_si128)
#pragma intrinsic(_mm_xor_si128)
#pragma intrinsic(_mm_slli_si128)
#pragma intrinsic(_mm_slli_epi16)
#pragma intrinsic(_mm_sll_epi16)
#pragma intrinsic(_mm_slli_epi32)
#pragma intrinsic(_mm_sll_epi32)
#pragma intrinsic(_mm_slli_epi64)
#pragma intrinsic(_mm_sll_epi64)
#pragma intrinsic(_mm_srai_epi16)
#pragma intrinsic(_mm_sra_epi16)
#pragma intrinsic(_mm_srai_epi32)
#pragma intrinsic(_mm_sra_epi32)
#pragma intrinsic(_mm_srli_si128)
#pragma intrinsic(_mm_srli_epi16)
#pragma intrinsic(_mm_srl_epi16)
#pragma intrinsic(_mm_srli_epi32)
#pragma intrinsic(_mm_srl_epi32)
#pragma intrinsic(_mm_srli_epi64)
#pragma intrinsic(_mm_srl_epi64)
#pragma intrinsic(_mm_cmpeq_epi8)
#pragma intrinsic(_mm_cmpeq_epi16)
#pragma intrinsic(_mm_cmpeq_epi32)
#pragma intrinsic(_mm_cmpgt_epi8)
#pragma intrinsic(_mm_cmpgt_epi16)
#pragma intrinsic(_mm_cmpgt_epi32)
#pragma intrinsic(_mm_cmplt_epi8)
#pragma intrinsic(_mm_cmplt_epi16)
#pragma intrinsic(_mm_cmplt_epi32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_sd)
#pragma intrinsic(_mm_cvtsd_si64)
#pragma intrinsic(_mm_cvttsd_si64)
#endif
#pragma intrinsic(_mm_cvtepi32_ps)
#pragma intrinsic(_mm_cvtps_epi32)
#pragma intrinsic(_mm_cvttps_epi32)
#pragma intrinsic(_mm_cvtsi32_si128)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi64_si128)
#endif
#pragma intrinsic(_mm_cvtsi128_si32)
#ifdef _M_AMD64
#pragma intrinsic(_mm_cvtsi128_si64)
#endif
#pragma intrinsic(_mm_load_si128)
#pragma intrinsic(_mm_loadu_si128)
#pragma intrinsic(_mm_loadl_epi64)
//#pragma intrinsic(_mm_undefined_si128)
//#pragma intrinsic(_mm_set_epi64x)
//#pragma intrinsic(_mm_set_epi64)
#pragma intrinsic(_mm_set_epi32)
#pragma intrinsic(_mm_set_epi16)
#pragma intrinsic(_mm_set_epi8)
//#pragma intrinsic(_mm_set1_epi64x)
//#pragma intrinsic(_mm_set1_epi64)
#pragma intrinsic(_mm_set1_epi32)
#pragma intrinsic(_mm_set1_epi16)
#pragma intrinsic(_mm_set1_epi8)
#pragma intrinsic(_mm_setl_epi64)
//#pragma intrinsic(_mm_setr_epi64)
#pragma intrinsic(_mm_setr_epi32)
#pragma intrinsic(_mm_setr_epi16)
#pragma intrinsic(_mm_setr_epi8)
#pragma intrinsic(_mm_setzero_si128)
#pragma intrinsic(_mm_store_si128)
#pragma intrinsic(_mm_storeu_si128)
//#pragma intrinsic(_mm_storeu_si64)
//#pragma intrinsic(_mm_storeu_si32)
//#pragma intrinsic(_mm_storeu_si16)
#pragma intrinsic(_mm_maskmoveu_si128)
#pragma intrinsic(_mm_storel_epi64)
#pragma intrinsic(_mm_stream_pd)
#pragma intrinsic(_mm_stream_si128)
#pragma intrinsic(_mm_stream_si32)
#pragma intrinsic(_mm_clflush)
#pragma intrinsic(_mm_lfence)
#pragma intrinsic(_mm_mfence)
#pragma intrinsic(_mm_packs_epi16)
#pragma intrinsic(_mm_packs_epi32)
#pragma intrinsic(_mm_packus_epi16)
#pragma intrinsic(_mm_extract_epi16)
#pragma intrinsic(_mm_insert_epi16)
#pragma intrinsic(_mm_movemask_epi8)
#pragma intrinsic(_mm_shuffle_epi32)
#pragma intrinsic(_mm_shufflelo_epi16)
#pragma intrinsic(_mm_shufflehi_epi16)
#pragma intrinsic(_mm_unpackhi_epi8)
#pragma intrinsic(_mm_unpackhi_epi16)
#pragma intrinsic(_mm_unpackhi_epi32)
#pragma intrinsic(_mm_unpackhi_epi64)
#pragma intrinsic(_mm_unpacklo_epi8)
#pragma intrinsic(_mm_unpacklo_epi16)
#pragma intrinsic(_mm_unpacklo_epi32)
#pragma intrinsic(_mm_unpacklo_epi64)
//#pragma intrinsic(_mm_movepi64_pi64)
//#pragma intrinsic(_mm_movpi64_epi64)
#pragma intrinsic(_mm_move_epi64)
#pragma intrinsic(_mm_unpackhi_pd)
#pragma intrinsic(_mm_unpacklo_pd)
#pragma intrinsic(_mm_movemask_pd)
#pragma intrinsic(_mm_shuffle_pd)
#pragma intrinsic(_mm_castpd_ps)
#pragma intrinsic(_mm_castpd_si128)
#pragma intrinsic(_mm_castps_pd)
#pragma intrinsic(_mm_castps_si128)
#pragma intrinsic(_mm_castsi128_ps)
#pragma intrinsic(_mm_castsi128_pd)
#pragma intrinsic(_mm_pause)

#else /* _MSC_VER */

/*
  Clang: https://github.com/llvm/llvm-project/blob/main/clang/lib/Headers/emmintrin.h
  Clang older version: https://github.com/llvm/llvm-project/blob/3ef88b31843e040c95f23ff2c3c206f1fa399c05/clang/lib/Headers/emmintrin.h
  unikraft: https://github.com/unikraft/lib-intel-intrinsics/blob/staging/include/emmintrin.h
*/

__INTRIN_INLINE_SSE2 __m128d _mm_add_sd(__m128d a, __m128d b)
{
    a[0] += b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_add_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a + (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_sd(__m128d a, __m128d b)
{
    a[0] -= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_sub_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a - (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_sd(__m128d a, __m128d b)
{
    a[0] *= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_mul_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a * (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_sd(__m128d a, __m128d b)
{
    a[0] /= b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_div_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2df)a / (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_sqrtsd((__v2df)b);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_sqrt_pd(__m128d a)
{
    return __builtin_ia32_sqrtpd((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_minsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_min_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_minpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_max_pd(__m128d a, __m128d b)
{
    return __builtin_ia32_maxpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    return (__m128d)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_or_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_xor_pd(__m128d a, __m128d b)
{
    return (__m128d)((__v2du)a ^ (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltpd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlepd((__v2df)b, (__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpeqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmplesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmplesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpunordsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpneqsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnltsd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return (__m128d)__builtin_ia32_cmpnlesd((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnltsd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    __m128d __c = __builtin_ia32_cmpnlesd((__v2df)b, (__v2df)a);
    return __extension__(__m128d){__c[0], a[1]};
}

__INTRIN_INLINE_SSE2 int _mm_comieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_comineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_comisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomieq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdeq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomilt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdlt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomile_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdle((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomigt_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdgt((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomige_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdge((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 int _mm_ucomineq_sd(__m128d a, __m128d b)
{
    return __builtin_ia32_ucomisdneq((__v2df)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtpd_ps(__m128d a)
{
    return __builtin_ia32_cvtpd2ps((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtps_pd(__m128 a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4sf)a, (__v4sf)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtps2pd(a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtepi32_pd(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128d)__builtin_convertvector(__builtin_shufflevector((__v4si)a, (__v4si)a, 0, 1), __v2df);
#else
    return __builtin_ia32_cvtdq2pd((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvtpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvtsd_si32(__m128d a)
{
    return __builtin_ia32_cvtsd2si((__v2df)a);
}

__INTRIN_INLINE_SSE2 __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
{
    return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)a, (__v2df)b);
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi32_sd(__m128d a, int b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_cvtss_sd(__m128d a, __m128 b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttpd_epi32(__m128d a)
{
    return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)a);
}

__INTRIN_INLINE_SSE2 int _mm_cvttsd_si32(__m128d a)
{
    return __builtin_ia32_cvttsd2si((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvtpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvtpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_cvttpd_pi32(__m128d a)
{
    return (__m64)__builtin_ia32_cvttpd2pi((__v2df)a);
}

__INTRIN_INLINE_MMXSSE2 __m128d _mm_cvtpi32_pd(__m64 a)
{
    return __builtin_ia32_cvtpi2pd((__v2si)a);
}

__INTRIN_INLINE_SSE2 double _mm_cvtsd_f64(__m128d a)
{
    return a[0];
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_pd(double const *dp)
{
    return *(const __m128d *)dp;
}

__INTRIN_INLINE_SSE2 __m128d _mm_load1_pd(double const *dp)
{
    struct __mm_load1_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load1_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, __u};
}

// GCC:
/* Create a selector for use with the SHUFPD instruction.  */
#define _MM_SHUFFLE2(fp1,fp0) \
 (((fp1) << 1) | (fp0))
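
/* Example (illustrative, not part of the upstream header): with
 * _mm_shuffle_pd, bit 0 of the selector picks the low result lane from a and
 * bit 1 picks the high lane from b, so
 *
 *     __m128d r = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(1, 0));
 *
 * yields r = { a[0], b[1] }. */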

__INTRIN_INLINE_SSE2 __m128d _mm_loadr_pd(double const *dp)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    __m128d u = *(const __m128d *)dp;
    return __builtin_shufflevector((__v2df)u, (__v2df)u, 1, 0);
#else
    return (__m128d){ dp[1], dp[0] };
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadu_pd(double const *dp)
{
    struct __loadu_pd {
      __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_pd *)dp)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si64(void const *a)
{
    struct __loadu_si64 {
      long long __v;
    } __attribute__((__packed__, __may_alias__));
    long long __u = ((const struct __loadu_si64 *)a)->__v;
    return __extension__(__m128i)(__v2di){__u, 0LL};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si32(void const *a)
{
    struct __loadu_si32 {
      int __v;
    } __attribute__((__packed__, __may_alias__));
    int __u = ((const struct __loadu_si32 *)a)->__v;
    return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si16(void const *a)
{
    struct __loadu_si16 {
      short __v;
    } __attribute__((__packed__, __may_alias__));
    short __u = ((const struct __loadu_si16 *)a)->__v;
    return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_load_sd(double const *dp)
{
    struct __mm_load_sd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_load_sd_struct *)dp)->__u;
    return __extension__(__m128d){__u, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadh_pd(__m128d a, double const *dp)
{
    struct __mm_loadh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadh_pd_struct *)dp)->__u;
    return __extension__(__m128d){a[0], __u};
}

__INTRIN_INLINE_SSE2 __m128d _mm_loadl_pd(__m128d a, double const *dp)
{
    struct __mm_loadl_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    double __u = ((const struct __mm_loadl_pd_struct *)dp)->__u;
    return __extension__(__m128d){__u, a[1]};
}

__INTRIN_INLINE_SSE2 __m128d _mm_undefined_pd(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128d)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: yields an indeterminate value without
       loading from memory, while keeping uninitialized-use warnings quiet. */
    __m128d undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_sd(double w)
{
    return __extension__(__m128d){w, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set1_pd(double w)
{
    return __extension__(__m128d){w, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_set_pd(double w, double x)
{
    return __extension__(__m128d){x, w};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setr_pd(double w, double x)
{
    return __extension__(__m128d){w, x};
}

__INTRIN_INLINE_SSE2 __m128d _mm_setzero_pd(void)
{
    return __extension__(__m128d){0, 0};
}

__INTRIN_INLINE_SSE2 __m128d _mm_move_sd(__m128d a, __m128d b)
{
    a[0] = b[0];
    return a;
}

__INTRIN_INLINE_SSE2 void _mm_store_sd(double *dp, __m128d a)
{
    struct __mm_store_sd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_store_sd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_store_pd(double *dp, __m128d a)
{
    *(__m128d *)dp = a;
}

__INTRIN_INLINE_SSE2 void _mm_store1_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 0, 0);
    _mm_store_pd(dp, a);
#else
    dp[0] = a[0];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeu_pd(double *dp, __m128d a)
{
    struct __storeu_pd {
      __m128d_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_pd *)dp)->__v = a;
}

__INTRIN_INLINE_SSE2 void _mm_storer_pd(double *dp, __m128d a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    a = __builtin_shufflevector((__v2df)a, (__v2df)a, 1, 0);
    *(__m128d *)dp = a;
#else
    dp[0] = a[1];
    dp[1] = a[0];
#endif
}

__INTRIN_INLINE_SSE2 void _mm_storeh_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[1];
}

__INTRIN_INLINE_SSE2 void _mm_storel_pd(double *dp, __m128d a)
{
    struct __mm_storeh_pd_struct {
      double __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storeh_pd_struct *)dp)->__u = a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a + (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a + (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a + (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_paddq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a + (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_paddsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_paddsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_paddusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_add_sat)
    return (__m128i)__builtin_elementwise_add_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_paddusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgb128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pavgw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_madd_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_max_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_max)
    return (__m128i)__builtin_elementwise_max((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pmaxub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_pminsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_min_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_min)
    return (__m128i)__builtin_elementwise_min((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_pminub128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mullo_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a * (__v8hu)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_mul_su32(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_pmuludq((__v2si)a, (__v2si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_mul_epu32(__m128i a, __m128i b)
{
    return __builtin_ia32_pmuludq128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sad_epu8(__m128i a, __m128i b)
{
    return __builtin_ia32_psadbw128((__v16qi)a, (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qu)a - (__v16qu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hu)a - (__v8hu)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4su)a - (__v4su)b);
}

__INTRIN_INLINE_MMXSSE2 __m64 _mm_sub_si64(__m64 a, __m64 b)
{
    return (__m64)__builtin_ia32_psubq((__v1di)a, (__v1di)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sub_epi64(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a - (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qs)a, (__v16qs)b);
#else
    return (__m128i)__builtin_ia32_psubsb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hi)a, (__v8hi)b);
#else
    return (__m128i)__builtin_ia32_psubsw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v16qu)a, (__v16qu)b);
#else
    return (__m128i)__builtin_ia32_psubusb128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_subs_epu16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_elementwise_sub_sat)
    return (__m128i)__builtin_elementwise_sub_sat((__v8hu)a, (__v8hu)b);
#else
    return (__m128i)__builtin_ia32_psubusw128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return (__m128i)(~(__v2du)a & (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_or_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a | (__v2du)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_xor_si128(__m128i a, __m128i b)
{
    return (__m128i)((__v2du)a ^ (__v2du)b);
}

#define _mm_slli_si128(a, imm) \
    ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))
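
/* Example (illustrative, not part of the upstream header): the shift amount
 * is in bytes, not bits, so
 *
 *     __m128i r = _mm_slli_si128(a, 4);
 *
 * shifts the whole 128-bit value left by 4 bytes (32 bits), zero-filling. */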

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psllwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psllw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_pslldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_pslld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_slli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psllqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psllq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrawi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psraw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srai_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psradi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_sra_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrad128((__v4si)a, (__v4si)count);
}

#define _mm_srli_si128(a, imm) \
    ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm)))

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi16(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrlwi128((__v8hi)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi16(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrlw128((__v8hi)a, (__v8hi)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi32(__m128i a, int count)
{
    return (__m128i)__builtin_ia32_psrldi128((__v4si)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi32(__m128i a, __m128i count)
{
    return (__m128i)__builtin_ia32_psrld128((__v4si)a, (__v4si)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srli_epi64(__m128i a, int count)
{
    return __builtin_ia32_psrlqi128((__v2di)a, count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_srl_epi64(__m128i a, __m128i count)
{
    return __builtin_ia32_psrlq128((__v2di)a, (__v2di)count);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return (__m128i)((__v16qi)a == (__v16qi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return (__m128i)((__v8hi)a == (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return (__m128i)((__v4si)a == (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    /* This function always performs a signed comparison, but __v16qi is a char
       which may be signed or unsigned, so use __v16qs. */
    return (__m128i)((__v16qs)a > (__v16qs)b);
}
1455 
_mm_cmpgt_epi16(__m128i a,__m128i b)1456 __INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
1457 {
1458     return (__m128i)((__v8hi)a > (__v8hi)b);
1459 }
1460 
_mm_cmpgt_epi32(__m128i a,__m128i b)1461 __INTRIN_INLINE_SSE2 __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
1462 {
1463     return (__m128i)((__v4si)a > (__v4si)b);
1464 }
1465 
_mm_cmplt_epi8(__m128i a,__m128i b)1466 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
1467 {
1468     return _mm_cmpgt_epi8(b, a);
1469 }
1470 
_mm_cmplt_epi16(__m128i a,__m128i b)1471 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
1472 {
1473     return _mm_cmpgt_epi16(b, a);
1474 }
1475 
_mm_cmplt_epi32(__m128i a,__m128i b)1476 __INTRIN_INLINE_SSE2 __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
1477 {
1478     return _mm_cmpgt_epi32(b, a);
1479 }
1480 
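/*
 * Usage sketch (illustrative): the comparisons return all-ones in each lane
 * where the predicate holds and zero elsewhere, so the result can be used
 * directly as a mask.
 *
 *     __m128i a  = _mm_set1_epi8(5);
 *     __m128i b  = _mm_set1_epi8(5);
 *     __m128i eq = _mm_cmpeq_epi8(a, b);   // all lanes 0xFF
 *     int bits   = _mm_movemask_epi8(eq);  // 0xFFFF, one bit per lane
 */
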
#ifdef _M_AMD64

__INTRIN_INLINE_SSE2 __m128d _mm_cvtsi64_sd(__m128d a, long long b)
{
    a[0] = b;
    return a;
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsd_si64(__m128d a)
{
    return __builtin_ia32_cvtsd2si64((__v2df)a);
}

__INTRIN_INLINE_SSE2 long long _mm_cvttsd_si64(__m128d a)
{
    return __builtin_ia32_cvttsd2si64((__v2df)a);
}
#endif

__INTRIN_INLINE_SSE2 __m128 _mm_cvtepi32_ps(__m128i a)
{
#if HAS_BUILTIN(__builtin_convertvector)
    return (__m128)__builtin_convertvector((__v4si)a, __v4sf);
#else
    return __builtin_ia32_cvtdq2ps((__v4si)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)a);
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvttps_epi32(__m128 a)
{
    return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)a);
}

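/*
 * Usage sketch (illustrative): _mm_cvtps_epi32 rounds according to the
 * current MXCSR rounding mode, while _mm_cvttps_epi32 truncates toward zero.
 *
 *     __m128  f = _mm_set1_ps(2.7f);
 *     __m128i r = _mm_cvtps_epi32(f);   // 3 in each lane (round-to-nearest)
 *     __m128i t = _mm_cvttps_epi32(f);  // 2 in each lane
 */
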
__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi32_si128(int a)
{
    return __extension__(__m128i)(__v4si){a, 0, 0, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_cvtsi64_si128(long long a)
{
    return __extension__(__m128i)(__v2di){a, 0};
}

__INTRIN_INLINE_SSE2 int _mm_cvtsi128_si32(__m128i a)
{
    __v4si b = (__v4si)a;
    return b[0];
}

__INTRIN_INLINE_SSE2 long long _mm_cvtsi128_si64(__m128i a)
{
    return a[0];
}

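/*
 * Usage sketch (illustrative): these move a scalar into or out of the low
 * lane; on the way in, the upper lanes are zeroed.
 *
 *     __m128i v = _mm_cvtsi32_si128(42);  // {42, 0, 0, 0}
 *     int     x = _mm_cvtsi128_si32(v);   // 42
 */
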
__INTRIN_INLINE_SSE2 __m128i _mm_load_si128(__m128i const *p)
{
    return *p;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadu_si128(__m128i_u const *p)
{
    struct __loadu_si128 {
      __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    return ((const struct __loadu_si128 *)p)->__v;
}

__INTRIN_INLINE_SSE2 __m128i _mm_loadl_epi64(__m128i_u const *p)
{
    struct __mm_loadl_epi64_struct {
      long long __u;
    } __attribute__((__packed__, __may_alias__));
    return __extension__(__m128i){
        ((const struct __mm_loadl_epi64_struct *)p)->__u, 0};
}

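/*
 * Usage sketch (illustrative): _mm_load_si128 requires a 16-byte-aligned
 * address, _mm_loadu_si128 accepts any address; buf is a hypothetical array.
 *
 *     int buf[8];
 *     __m128i u = _mm_loadu_si128((__m128i_u const *)(buf + 1));
 */
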
__INTRIN_INLINE_SSE2 __m128i _mm_undefined_si128(void)
{
#if HAS_BUILTIN(__builtin_ia32_undef128)
    return (__m128i)__builtin_ia32_undef128();
#else
    /* Deliberate self-initialization: the contents are allowed to be anything. */
    __m128i undef = undef;
    return undef;
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64x(long long q1, long long q0)
{
    return __extension__(__m128i)(__v2di){q0, q1};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi64(__m64 q1, __m64 q0)
{
    return _mm_set_epi64x((long long)q1, (long long)q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
{
    return __extension__(__m128i)(__v4si){i0, i1, i2, i3};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi16(
    short w7, short w6, short w5, short w4,
    short w3, short w2, short w1, short w0)
{
    return __extension__(__m128i)(__v8hi){w0, w1, w2, w3, w4, w5, w6, w7};
}

__INTRIN_INLINE_SSE2 __m128i _mm_set_epi8(
    char b15, char b14, char b13, char b12,
    char b11, char b10, char b9, char b8,
    char b7, char b6, char b5, char b4,
    char b3, char b2, char b1, char b0)
{
    return __extension__(__m128i)(__v16qi){
        b0, b1, b2,  b3,  b4,  b5,  b6,  b7,
        b8, b9, b10, b11, b12, b13, b14, b15};
}

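/*
 * Usage sketch (illustrative): the _mm_set_* functions take arguments from
 * the most significant element down, so the last argument lands in lane 0.
 *
 *     __m128i v  = _mm_set_epi32(3, 2, 1, 0);
 *     int     lo = _mm_cvtsi128_si32(v);  // 0
 */
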
__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64x(long long q)
{
    return _mm_set_epi64x(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi64(__m64 q)
{
    return _mm_set_epi64(q, q);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi32(int i)
{
    return _mm_set_epi32(i, i, i, i);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi16(short w)
{
    return _mm_set_epi16(w, w, w, w, w, w, w, w);
}

__INTRIN_INLINE_SSE2 __m128i _mm_set1_epi8(char b)
{
    return _mm_set_epi8(b, b, b, b, b, b, b, b,
                        b, b, b, b, b, b, b, b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi64(__m64 q0, __m64 q1)
{
    return _mm_set_epi64(q1, q0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi32(int i0, int i1, int i2, int i3)
{
    return _mm_set_epi32(i3, i2, i1, i0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi16(
    short w0, short w1, short w2, short w3,
    short w4, short w5, short w6, short w7)
{
    return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

__INTRIN_INLINE_SSE2 __m128i _mm_setr_epi8(
    char b0, char b1, char b2, char b3,
    char b4, char b5, char b6, char b7,
    char b8, char b9, char b10, char b11,
    char b12, char b13, char b14, char b15)
{
    return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, b8,
                        b7, b6, b5, b4, b3, b2, b1, b0);
}

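/*
 * Usage sketch (illustrative): the _mm_setr_* variants take arguments in
 * memory order instead, so the first argument lands in lane 0.
 *
 *     __m128i a = _mm_setr_epi32(0, 1, 2, 3);
 *     __m128i b = _mm_set_epi32(3, 2, 1, 0);  // same value as a
 */
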
__INTRIN_INLINE_SSE2 __m128i _mm_setzero_si128(void)
{
    return __extension__(__m128i)(__v2di){0LL, 0LL};
}

__INTRIN_INLINE_SSE2 void _mm_store_si128(__m128i *p, __m128i b)
{
    *p = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si128(__m128i_u *p, __m128i b)
{
    struct __storeu_si128 {
      __m128i_u __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si128 *)p)->__v = b;
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si64(void *p, __m128i b)
{
    struct __storeu_si64 {
      long long __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si64 *)p)->__v = ((__v2di)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si32(void *p, __m128i b)
{
    struct __storeu_si32 {
      int __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si32 *)p)->__v = ((__v4si)b)[0];
}

__INTRIN_INLINE_SSE2 void _mm_storeu_si16(void *p, __m128i b)
{
    struct __storeu_si16 {
      short __v;
    } __attribute__((__packed__, __may_alias__));
    ((struct __storeu_si16 *)p)->__v = ((__v8hi)b)[0];
}

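/*
 * Usage sketch (illustrative): the sized unaligned stores write only the low
 * 64/32/16 bits of the vector; out is a hypothetical destination.
 *
 *     short out;
 *     _mm_storeu_si16(&out, _mm_set1_epi16(7));  // out == 7
 */
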
__INTRIN_INLINE_SSE2 void _mm_maskmoveu_si128(__m128i d, __m128i n, char *p)
{
    __builtin_ia32_maskmovdqu((__v16qi)d, (__v16qi)n, p);
}

__INTRIN_INLINE_SSE2 void _mm_storel_epi64(__m128i_u *p, __m128i a)
{
    struct __mm_storel_epi64_struct {
      long long __u;
    } __attribute__((__packed__, __may_alias__));
    ((struct __mm_storel_epi64_struct *)p)->__u = a[0];
}

__INTRIN_INLINE_SSE2 void _mm_stream_pd(double *p, __m128d a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2df)a, (__v2df *)p);
#else
    __builtin_ia32_movntpd(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si128(__m128i *p, __m128i a)
{
#if HAS_BUILTIN(__builtin_nontemporal_store)
    __builtin_nontemporal_store((__v2di)a, (__v2di *)p);
#else
    __builtin_ia32_movntdq(p, a);
#endif
}

__INTRIN_INLINE_SSE2 void _mm_stream_si32(int *p, int a)
{
    __builtin_ia32_movnti(p, a);
}

#ifdef _M_AMD64
__INTRIN_INLINE_SSE2 void _mm_stream_si64(long long *p, long long a)
{
    __builtin_ia32_movnti64(p, a);
}
#endif

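/*
 * Usage sketch (illustrative): the _mm_stream_* stores are non-temporal
 * hints that bypass the cache; order them with a fence (e.g. _mm_mfence
 * below, or _mm_sfence from xmmintrin.h) before another agent reads the
 * data. dst is a hypothetical int pointer.
 *
 *     _mm_stream_si32(dst, 123);
 *     _mm_sfence();
 */
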
void _mm_clflush(void const *p);

void _mm_lfence(void);

void _mm_mfence(void);

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packsswb128((__v8hi)a, (__v8hi)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packs_epi32(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packssdw128((__v4si)a, (__v4si)b);
}

__INTRIN_INLINE_SSE2 __m128i _mm_packus_epi16(__m128i a, __m128i b)
{
    return (__m128i)__builtin_ia32_packuswb128((__v8hi)a, (__v8hi)b);
}

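/*
 * Usage sketch (illustrative): packing narrows each lane with saturation.
 *
 *     __m128i w = _mm_set1_epi16(300);
 *     __m128i s = _mm_packs_epi16(w, w);   // signed saturation:   127
 *     __m128i u = _mm_packus_epi16(w, w);  // unsigned saturation: 255
 */
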
#define _mm_extract_epi16(a, imm)                                              \
    ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a),      \
                                                      (int)(imm)))

#define _mm_insert_epi16(a, b, imm)                                            \
    ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b),        \
                                          (int)(imm)))

__INTRIN_INLINE_SSE2 int _mm_movemask_epi8(__m128i a)
{
    return __builtin_ia32_pmovmskb128((__v16qi)a);
}

#define _mm_shuffle_epi32(a, imm)                                              \
    ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))

#define _mm_shufflelo_epi16(a, imm)                                            \
    ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))

#define _mm_shufflehi_epi16(a, imm)                                            \
    ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))

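/*
 * Usage sketch (illustrative): the shuffle immediate selects source lanes;
 * the _MM_SHUFFLE macro from xmmintrin.h builds it. v stands for any
 * __m128i.
 *
 *     __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));  // reversed
 */
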
__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
        16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
#else
    return (__m128i)__builtin_ia32_punpckhbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 4, 8 + 4, 5,
                                            8 + 5, 6, 8 + 6, 7, 8 + 7);
#else
    return (__m128i)__builtin_ia32_punpckhwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 2, 4 + 2, 3,
                                            4 + 3);
#else
    return (__m128i)__builtin_ia32_punpckhdq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 1, 2 + 1);
#else
    return (__m128i)__builtin_ia32_punpckhqdq128((__v2di)a, (__v2di)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector(
        (__v16qi)a, (__v16qi)b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
        16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
#else
    return (__m128i)__builtin_ia32_punpcklbw128((__v16qi)a, (__v16qi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v8hi)a, (__v8hi)b, 0, 8 + 0, 1,
                                            8 + 1, 2, 8 + 2, 3, 8 + 3);
#else
    return (__m128i)__builtin_ia32_punpcklwd128((__v8hi)a, (__v8hi)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v4si)a, (__v4si)b, 0, 4 + 0, 1,
                                            4 + 1);
#else
    return (__m128i)__builtin_ia32_punpckldq128((__v4si)a, (__v4si)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return (__m128i)__builtin_shufflevector((__v2di)a, (__v2di)b, 0, 2 + 0);
#else
    return (__m128i)__builtin_ia32_punpcklqdq128((__v2di)a, (__v2di)b);
#endif
}

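/*
 * Usage sketch (illustrative): unpacklo/unpackhi interleave lanes from the
 * low/high halves of the two sources.
 *
 *     __m128i a  = _mm_setr_epi32(0, 1, 2, 3);
 *     __m128i b  = _mm_setr_epi32(4, 5, 6, 7);
 *     __m128i lo = _mm_unpacklo_epi32(a, b);  // lanes {0, 4, 1, 5}
 */
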
__INTRIN_INLINE_SSE2 __m64 _mm_movepi64_pi64(__m128i a)
{
    return (__m64)a[0];
}

__INTRIN_INLINE_SSE2 __m128i _mm_movpi64_epi64(__m64 a)
{
    return __extension__(__m128i)(__v2di){(long long)a, 0};
}

__INTRIN_INLINE_SSE2 __m128i _mm_move_epi64(__m128i a)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2di)a, _mm_setzero_si128(), 0, 2);
#else
    return (__m128i)__builtin_ia32_movq128((__v2di)a);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 1, 2 + 1);
#else
    return (__m128d)__builtin_ia32_unpckhpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if HAS_BUILTIN(__builtin_shufflevector)
    return __builtin_shufflevector((__v2df)a, (__v2df)b, 0, 2 + 0);
#else
    return (__m128d)__builtin_ia32_unpcklpd((__v2df)a, (__v2df)b);
#endif
}

__INTRIN_INLINE_SSE2 int _mm_movemask_pd(__m128d a)
{
    return __builtin_ia32_movmskpd((__v2df)a);
}

#define _mm_shuffle_pd(a, b, i)                                                \
    ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b),  \
                                    (int)(i)))

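/*
 * Usage sketch (illustrative): _mm_movemask_pd packs the two double-precision
 * sign bits into bits 0 and 1 of the result.
 *
 *     int m = _mm_movemask_pd(_mm_set_pd(-1.0, 1.0));  // 2 (high lane only)
 */
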
__INTRIN_INLINE_SSE2 __m128 _mm_castpd_ps(__m128d a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castpd_si128(__m128d a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castps_pd(__m128 a)
{
    return (__m128d)a;
}

__INTRIN_INLINE_SSE2 __m128i _mm_castps_si128(__m128 a)
{
    return (__m128i)a;
}

__INTRIN_INLINE_SSE2 __m128 _mm_castsi128_ps(__m128i a)
{
    return (__m128)a;
}

__INTRIN_INLINE_SSE2 __m128d _mm_castsi128_pd(__m128i a)
{
    return (__m128d)a;
}

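/*
 * Note (illustrative): the _mm_cast* functions reinterpret the bits of a
 * vector without generating any instructions; use the cvt* functions above
 * when an actual value conversion is needed.
 *
 *     __m128i bits = _mm_castps_si128(_mm_set1_ps(1.0f));  // 0x3F800000 lanes
 */
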
void _mm_pause(void);

#endif /* _MSC_VER */

#ifdef __cplusplus
} // extern "C"
#endif

#endif /* _INCLUDED_EMM */